In [1]:
import pandas as pd
import numpy as np
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

# Load the posts table used throughout the notebook. The path is relative to
# the notebook's working directory; going by the file name, the data is
# presumably already cleaned upstream.
df = pd.read_csv(filepath_or_buffer='./data/csv/all_cleaned.csv')

In [3]:
# Free-text columns are kept as-is; every remaining column except the
# categorical `subreddit` is treated as numeric/boolean and cast to float.
text_feats = ['title', 'selftext']
num_and_bool_feats = [
    name for name in df.columns
    if name not in (*text_feats, 'subreddit')
]

# Reassemble the frame in a fixed layout: text, then numeric, then subreddit.
df_new = (
    df[text_feats]
    .join(df[num_and_bool_feats].astype(float))
    .join(df[['subreddit']])
)

# Binary target: 1 when a post's score is strictly above the sample median.
sample_median = df_new['score'].median()
target = df_new['score'].gt(sample_median).astype(int)


In [4]:
# Hold out a test split. `score` is removed from the features because the
# target is derived from it. `columns=` replaces the positional axis argument
# (`drop('score', 1)`), which pandas 2.0 no longer accepts, and `random_state`
# pins the shuffle so the split is identical on every run (41 matches the seed
# already used for LogisticRegression in this notebook).
X_train, X_test, y_train, y_test = train_test_split(
    df_new.drop(columns='score'), target, random_state=41
)

In [5]:
# Preview the first five training rows to check which feature columns survived.
X_train.head(n=5)

Unnamed: 0,title,selftext,num_comments,over_18,locked,stickied,created_utc,is_self,subreddit
3909,Maybe Maybe Maybe,,14.0,0.0,0.0,0.0,1527481000.0,0.0,maybemaybemaybe
8736,Today marks 2 years of sobriety inspired by Pr...,Thank you President Trump. About 2.5 years ago...,27.0,0.0,0.0,0.0,1527525000.0,1.0,The_Donald
3386,Profile,,1.0,0.0,0.0,0.0,1527520000.0,0.0,KatherineMcNamara
4502,I miss scenes like this. Pray for S4 and more ...,,6.0,0.0,0.0,0.0,1527512000.0,0.0,lucifer
5245,Rustam Khabilov lands some spinning shit on Jo...,,31.0,0.0,0.0,0.0,1527513000.0,0.0,MMA


I want to try running a `StandardScaler` on the created time (`created_utc`) to see whether that helps.

Things that need to be saved so that new data can be processed the same way in the future:
> The standard scaler information.  
> The count vectorizers  
> The dummied variable names


The code below ***SHOULD*** convert a DataFrame into a sparse array.

In [6]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd

class GetDummies(TransformerMixin, BaseEstimator):
    """One-hot encode selected categorical columns as sparse indicator columns.

    ``fit`` learns, for every column in ``column_names``, which output column
    each observed category maps to. ``transform`` replaces those columns with
    one indicator column per learned category and keeps every other column of
    the input unchanged.

    Parameters
    ----------
    column_names : str or iterable of str, optional
        Column(s) of the input DataFrame to encode. A single string is treated
        as one column name.

    Attributes
    ----------
    columns_locator : dict
        ``{column: {category: indicator column position}}``; set by ``fit``.
    new_cols_ : int
        Total number of indicator columns produced; set by ``fit``.
    """

    def __init__(self, column_names=None):
        self.column_names = column_names

        # Wrap a bare string so it is treated as one column name instead of
        # being iterated character by character.
        if isinstance(self.column_names, str):
            self.column_names = {self.column_names}

    def _encoded_columns(self):
        """Return the columns to encode as a list."""
        if self.column_names is None:
            raise TypeError(
                "column_names must be a column name or an iterable of "
                "column names, not None"
            )
        # set_params() bypasses __init__, so a bare string can still show up.
        if isinstance(self.column_names, str):
            return [self.column_names]
        return list(self.column_names)

    def fit(self, X, *_):
        """Learn the category -> indicator-column mapping from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``column_names``.

        Returns
        -------
        self : GetDummies
            The fitted transformer, so calls can be chained and
            ``fit_transform`` / ``Pipeline`` work.
        """
        # Outer key: source column. Inner dict: category -> output position.
        self.columns_locator = {}
        offset = 0
        for col in self._encoded_columns():
            # unique() keeps first-appearance order, so positions are the same
            # on every run (iteration order of a set of strings is not), and
            # dropna() stops missing values from being given columns.
            categories = X[col].dropna().unique()
            self.columns_locator[col] = {
                value: i + offset for i, value in enumerate(categories)
            }
            offset += len(self.columns_locator[col])
        self.new_cols_ = offset
        return self

    def transform(self, X, *_):
        """Replace the encoded columns of ``X`` with sparse indicator columns.

        Values that were not seen during ``fit`` (including missing values)
        produce an all-zero row for that source column.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``column_names``.

        Returns
        -------
        pandas.DataFrame
            Indicator columns labelled ``0 .. new_cols_ - 1`` (sparse, float),
            followed by the columns of ``X`` that were not encoded. The index
            is the index of ``X``.
        """
        columns = self._encoded_columns()
        n_rows = len(X)

        # Collect (row, column) coordinates of the ones in a single pass per
        # source column instead of building one sparse matrix per row.
        row_parts, col_parts = [], []
        for col in columns:
            locator = self.columns_locator[col]
            # -1 marks values with no learned category.
            positions = np.fromiter(
                (locator.get(value, -1) for value in X[col]),
                dtype=np.intp,
                count=n_rows,
            )
            known = positions >= 0
            row_parts.append(np.flatnonzero(known))
            col_parts.append(positions[known])

        if row_parts:
            row_idx = np.concatenate(row_parts)
            col_idx = np.concatenate(col_parts)
        else:
            row_idx = col_idx = np.empty(0, dtype=np.intp)

        encoded = scipy.sparse.coo_matrix(
            (np.ones(len(row_idx)), (row_idx, col_idx)),
            shape=(n_rows, self.new_cols_),
        ).tocsr()

        if self.new_cols_ == 0:
            # Nothing to encode: contribute no columns, keep the index.
            dummies = pd.DataFrame(index=X.index)
        elif hasattr(pd.DataFrame, "sparse"):
            dummies = pd.DataFrame.sparse.from_spmatrix(encoded, index=X.index)
        else:
            # Older pandas without the DataFrame.sparse accessor.
            dummies = pd.SparseDataFrame(
                encoded, default_fill_value=0, index=X.index
            )

        return dummies.join(X.drop(columns=columns))

In [8]:
# Build the one-hot encoder for the `subreddit` column and learn its
# categories from the training split only.
gd = GetDummies(column_names=['subreddit'])
gd.fit(X=X_train)

In [9]:
# Encode both splits with the fitted transformer, then discard object-dtype
# columns so that only numeric and indicator features remain.
X_train_tr, X_test_tr = (
    gd.transform(split).select_dtypes(exclude='O')
    for split in (X_train, X_test)
)

In [11]:
# Single-step pipeline so hyper-parameters can be addressed as `logreg__*`.
pipe = Pipeline([
    ('logreg', LogisticRegression(random_state=41))
])

# Candidate regularisation strengths: 12 draws from an exponential with
# scale 1. A dedicated seeded generator is used (instead of the global NumPy
# RNG) so the grid, and therefore the search result, is the same on every run.
my_params = {
    'logreg__C': np.random.RandomState(41).exponential(1, 12)
}

g_logreg = GridSearchCV(pipe, param_grid=my_params)

# NOTE(review): the scores recorded for this cell (~0.50 on both splits) are
# about what a constant prediction would get on a median-split target.
# Presumably the unscaled `created_utc` feature (~1.5e9) is swamping the
# solver; confirm by scaling it before fitting.
g_logreg.fit(X_train_tr, y_train)
print(g_logreg.score(X_train_tr, y_train))
print(g_logreg.score(X_test_tr, y_test))

0.501260151218
0.501469970601
