In [2]:
import pandas as pd
import numpy as np
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

# Load the cleaned CSV export; the path is relative to the notebook's working
# directory. Cells below read the 'title', 'selftext', 'subreddit' and 'score' columns.
df = pd.read_csv('../data/csv/all_cleaned.csv')

In [3]:
# Free-text columns are kept as-is; every other column except the label
# column 'subreddit' is treated as numeric/boolean and cast to float.
text_feats = ['title', 'selftext']
num_and_bool_feats = [
    name for name in df.columns
    if name not in {*text_feats, 'subreddit'}
]

# Reassemble the frame as: text columns, float-cast columns, then 'subreddit'.
df_new = (
    df[text_feats]
    .join(df[num_and_bool_feats].astype(float))
    .join(df[['subreddit']])
)

# Binary target: 1 when a row's score is strictly above the sample median.
sample_median = df_new['score'].median()
target = df_new['score'].gt(sample_median).astype(int)


In [None]:
# Up-sample every subreddit (with replacement) to the size of the largest one,
# collecting the sampled feature values in X (dict: column -> list) and the
# matching subreddit labels in y (list).
sizes = df['subreddit'].value_counts().to_dict()
max_size = max(sizes.values())

feature_cols = [col for col in df.columns if col != 'subreddit']
X = {col: [] for col in feature_cols}
y = []
# dropna(): a missing subreddit has no rows to sample from.
for subreddit in df['subreddit'].dropna().unique():
    sub_df = df[df['subreddit'] == subreddit]
    # Re-seeding per group (as originally written) makes every subreddit draw
    # from the same random stream, so the result is reproducible.
    np.random.seed(41)
    sample_index = np.random.choice(sub_df.index, size=max_size, replace=True)
    # One label-based selection per group instead of a scalar lookup per cell;
    # repeated labels in sample_index yield repeated rows, in draw order.
    sampled = sub_df.loc[sample_index, feature_cols]
    for col in feature_cols:
        X[col].extend(sampled[col].tolist())
    y.extend([subreddit] * len(sample_index))



In [69]:

# Numeric-only feature matrix: the target source ('score'), the text columns,
# 'subreddit' and the 'over_18' / 'locked' flags are dropped before splitting.
# NOTE: the GetDummies cells further down fit on X_train['subreddit']; for
# those, split `df_new.drop('score', axis=1)` instead so the column is kept.
X_train, X_test, y_train, y_test = train_test_split(
    df_new.drop(
        ['score', 'subreddit', 'title', 'selftext', 'over_18', 'locked'],
        axis=1,  # keyword form: the positional axis argument is rejected by current pandas
    ),
    target,
    random_state=41)

In [70]:
# Preview the first rows of the training features to see which columns remain after the drop.
X_train.head()

Unnamed: 0,num_comments,stickied,created_utc,is_self
8575,7.0,0.0,1527496000.0,0.0
1222,24.0,0.0,1527527000.0,0.0
845,4.0,0.0,1527512000.0,0.0
4297,2.0,0.0,1527525000.0,0.0
1444,61.0,0.0,1527511000.0,0.0


I want to run a `StandardScaler` on the creation time (`created_utc`) to see whether scaling helps the model.

In [64]:
# Learn the scaling statistics for the count and timestamp columns from the
# training split only; the fitted scaler is shown as the cell output.
cols_scale = ['num_comments', 'created_utc']
ss = StandardScaler().fit(X_train[cols_scale])
ss


StandardScaler(copy=True, with_mean=True, with_std=True)

In [71]:
# Overwrite the selected columns of both splits with their standardised values,
# using the scaler fitted on the training split.
# Caution: this mutates X_train / X_test in place, so running the cell a second
# time scales the already-scaled values again.
X_train[cols_scale] = ss.transform(X_train[cols_scale])
X_test[cols_scale] = ss.transform(X_test[cols_scale])


In [72]:
# Baseline classifier with default hyperparameters, trained on the training
# split; the fitted estimator is shown as the cell output.
logreg = LogisticRegression().fit(X_train, y_train)
logreg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
# Accuracy on the (train, test) splits.
(logreg.score(X_train, y_train), logreg.score(X_test, y_test))

(0.80509661159339119, 0.80344393112137757)

In [74]:
# Fitted coefficients: a single row with one entry per feature column of X_train.
logreg.coef_

array([[ 8.79104937, -2.13063615, -2.31611227, -0.95614763]])

In [51]:
# Preview the held-out features.
# NOTE(review): the saved output below lists only two columns and carries an
# older execution count, so it looks stale relative to the split above — re-run to confirm.
X_test.head()

Unnamed: 0,num_comments,created_utc
4244,-0.118688,1.248023
4026,-0.120756,0.988026
6158,-0.122825,0.519181
4273,-0.042157,0.295596
3089,-0.102141,1.230973


Things that need to be saved so that future data can be processed the same way:
> The standard scaler information.  
> The count vectorizers  
> The dummied variable names


The transformer below ***should*** convert a DataFrame into a sparse array.

In [6]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd

class GetDummies(TransformerMixin, BaseEstimator):
    """One-hot encode selected DataFrame columns as sparse indicator columns.

    Parameters
    ----------
    column_names : str or iterable of str
        Column(s) of the input DataFrame to encode. A single string is
        treated as one column name.

    Attributes
    ----------
    columns_locator : dict
        Set by ``fit``. Maps each encoded column name to a
        ``{category value: output column position}`` dict.
    new_cols_ : int
        Set by ``fit``. Total number of indicator columns produced.
    """

    def __init__(self, column_names=None):
        # Stored exactly as passed: scikit-learn's get_params/clone expect
        # __init__ not to rewrite its arguments. Normalisation to a list
        # happens in _columns().
        self.column_names = column_names

    def _columns(self):
        """Return the configured column names as a list.

        Raises
        ------
        TypeError
            If ``column_names`` is None (or otherwise not iterable).
        """
        if isinstance(self.column_names, str):
            return [self.column_names]
        return list(self.column_names)

    def fit(self, X, *_):
        """Learn the categories of every configured column.

        Categories are numbered in order of first appearance, so the layout
        of the output columns is the same on every run. Missing values are
        not given a column: they could never be matched at transform time.

        Returns
        -------
        self
            So the transformer works with ``fit_transform`` and ``Pipeline``.
        """
        # Key: encoded column name; value: {category: output column position}.
        self.columns_locator = {}
        offset = 0
        for col in self._columns():
            locator = {}
            for value in X[col].dropna().tolist():
                if value not in locator:
                    locator[value] = offset + len(locator)
            self.columns_locator[col] = locator
            offset += len(locator)
        self.new_cols_ = offset
        return self

    def transform(self, X, *_):
        """Replace the configured columns of ``X`` with indicator columns.

        Values not seen during ``fit`` (including missing values) produce an
        all-zero set of indicators for that column.

        Returns
        -------
        DataFrame
            Sparse indicator columns labelled ``0 .. new_cols_ - 1``, joined
            on the index with the remaining columns of ``X``.
        """
        columns = self._columns()

        # Collect the (row, column) coordinates of every indicator that is 1
        # and build the whole matrix at once.
        row_hits, col_hits = [], []
        for col in columns:
            locator = self.columns_locator[col]
            for row_position, value in enumerate(X[col].tolist()):
                target = locator.get(value)
                if target is not None:
                    row_hits.append(row_position)
                    col_hits.append(target)

        matrix = scipy.sparse.coo_matrix(
            (
                np.ones(len(row_hits)),
                (
                    np.asarray(row_hits, dtype=np.intp),
                    np.asarray(col_hits, dtype=np.intp),
                ),
            ),
            shape=(len(X), self.new_cols_),
        ).tocsr()

        if hasattr(pd.DataFrame, 'sparse'):
            dummies = pd.DataFrame.sparse.from_spmatrix(matrix, index=X.index)
        else:
            # Older pandas without the DataFrame.sparse accessor.
            dummies = pd.SparseDataFrame(matrix, default_fill_value=0, index=X.index)

        # The join aligns on the index labels of X.
        return dummies.join(X.drop(labels=columns, axis=1))

In [8]:
# Learn the subreddit categories from the training split. fit() reads
# X_train['subreddit'], so X_train must still contain that column here.
gd = GetDummies(['subreddit'])
gd.fit(X_train)

In [9]:
# Encode both splits with the fitted transformer, then discard any remaining
# object-dtype columns so only non-object features are passed to the model.
X_train_tr, X_test_tr = (
    gd.transform(split).select_dtypes(exclude='O')
    for split in (X_train, X_test)
)

In [11]:
# Single-step pipeline around the classifier.
pipe = Pipeline([
    ('logreg', LogisticRegression(random_state=41))
])

# Twelve candidate regularisation strengths drawn from an exponential
# distribution (scale 1). A private, seeded generator makes the grid
# reproducible without touching NumPy's global random state.
my_params = {
    'logreg__C': np.random.RandomState(41).exponential(1, 12)
}

g_logreg = GridSearchCV(pipe, param_grid=my_params)

g_logreg.fit(X_train_tr, y_train)
# NOTE(review): the saved scores below are ~0.50, i.e. chance level for a
# median-split target. Check whether the inputs reaching this cell were
# standardised (e.g. by adding a StandardScaler step to `pipe`).
print(g_logreg.score(X_train_tr, y_train))
print(g_logreg.score(X_test_tr, y_test))

0.501260151218
0.501469970601
