### Imports

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from nltk.tokenize.casual import TweetTokenizer
import regex
import emoji
import pickle
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from sklearn.linear_model import LogisticRegression
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from skopt import BayesSearchCV
from scipy.stats import uniform, loguniform
from skopt.space import Integer, Real, Categorical

In [None]:
# reading csv files
# title = pd.read_csv('C:/Users/mmoli/GA/projects/project_3/data/title_only_cleaned.csv')
# selftext = pd.read_csv('C:/Users/mmoli/GA/projects/project_3/data/selftext_only_cleaned.csv')
both = pd.read_csv('C:/Users/mmoli/GA/projects/project_3/data/both_cleaned.csv')

In [None]:
both.head()

### Tokenizing-Lemmatiziation Function

In [28]:
def mytokenizing_lem(column):
    words = ''
    for elements in column:
        words += elements
    tokenizer = RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(words)
    lemmatizer = WordNetLemmatizer()
    words_tokens_lem = [lemmatizer.lemmatize(token) for token in words_tokens]
    return words_tokens_lem

In [4]:
mytokenizing_lem(both['title'])

['What',
 's',
 'the',
 'difference',
 'between',
 'a',
 'dog',
 'and',
 'a',
 'cat',
 'lick',
 'Why',
 'is',
 'it',
 'wierd',
 'to',
 'find',
 'a',
 'latin',
 'woman',
 'or',
 'a',
 'Latinas',
 'attractive',
 'What',
 's',
 'a',
 'healthy',
 'way',
 'to',
 'discipline',
 'myself',
 'Applying',
 'Lube',
 'Why',
 'don',
 't',
 'the',
 'Democrats',
 'make',
 'the',
 'Republicans',
 'ACTUALLY',
 'carry',
 'out',
 'a',
 'filibuster',
 'When',
 'discussing',
 'suicide',
 'and',
 'mental',
 'health',
 'why',
 'doe',
 'no',
 'one',
 'mention',
 'that',
 'woman',
 'attempt',
 'suicide',
 '3x',
 's',
 'the',
 'rate',
 'that',
 'men',
 'do',
 'What',
 'are',
 'the',
 'most',
 'positive',
 'subreddits',
 'out',
 'there',
 'Is',
 'it',
 'a',
 'normal',
 'feeling',
 'to',
 'be',
 'confused',
 'why',
 'someone',
 'you',
 'find',
 'attractive',
 'find',
 'you',
 'attractive',
 'How',
 'do',
 'i',
 'make',
 'digital',
 'estimate',
 'of',
 'wood',
 'project',
 'to',
 'get',
 'accurate',
 'measurement',

### Tokenizing-Stemming Function

In [25]:
def mytokenizing_stem(column):
    words = ''
    for elements in column:
        words += elements
    tokenizer = RegexpTokenizer(r'\w+')
    words_tokens = tokenizer.tokenize(words)
    p_stemmer = PorterStemmer()
    words_tokens_stem = [p_stemmer.stem(token) for token in words_tokens]
    return words_tokens_stem

In [48]:
mytokenizing_stem(both['title']);

['what',
 's',
 'the',
 'differ',
 'between',
 'a',
 'dog',
 'and',
 'a',
 'cat',
 'lick',
 'whi',
 'is',
 'it',
 'wierd',
 'to',
 'find',
 'a',
 'latin',
 'woman',
 'or',
 'a',
 'latina',
 'attract',
 'what',
 's',
 'a',
 'healthi',
 'way',
 'to',
 'disciplin',
 'myself',
 'appli',
 'lube',
 'whi',
 'don',
 't',
 'the',
 'democrat',
 'make',
 'the',
 'republican',
 'actual',
 'carri',
 'out',
 'a',
 'filibust',
 'when',
 'discuss',
 'suicid',
 'and',
 'mental',
 'health',
 'whi',
 'doe',
 'no',
 'one',
 'mention',
 'that',
 'women',
 'attempt',
 'suicid',
 '3x',
 's',
 'the',
 'rate',
 'that',
 'men',
 'do',
 'what',
 'are',
 'the',
 'most',
 'posit',
 'subreddit',
 'out',
 'there',
 'is',
 'it',
 'a',
 'normal',
 'feel',
 'to',
 'be',
 'confus',
 'whi',
 'someon',
 'you',
 'find',
 'attract',
 'find',
 'you',
 'attract',
 'how',
 'do',
 'i',
 'make',
 'digit',
 'estim',
 'of',
 'wood',
 'project',
 'to',
 'get',
 'accur',
 'measur',
 'dirt',
 'keep',
 'get',
 'under',
 'my',
 'extrem

### Text Edits

#### Emojis

In [6]:
# make use in a different model; not particularly useful

def split_count(array):
    emoji_list = []
    for text in array:
        data = regex.findall(r'\X', text)
        for word in data:
            if any(char in emoji.UNICODE_EMOJI['en'] for char in word):
                emoji_list.append(word)
    
    return emoji_list

https://stackoverflow.com/questions/43146528/how-to-extract-all-the-emojis-from-text

In [90]:
len(split_count(both['title']))

73

In [89]:
len(split_count(both['selftext']))

852

#### Contractions

In [29]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"it\'s", "its", phrase)
    phrase = re.sub(r"What\'s", "What is", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    # phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

In [30]:
# expanding contracted text
for col in ['text']:
    both['text'] = [decontracted(each) for each in both[col]]

#### Stop Words

In [8]:
default_stop_words = list(CountVectorizer(stop_words = 'english').get_stop_words())
my_stop_words = ['abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 'thenc', 'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv', 'anywh', 'becau', 'el', 'elsewh', 'everywh', 'ind', 'otherwi', 'plea', 'somewh'] 
my_stop_words_same = ['abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 'thenc', 'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv'] 



custom_stop_words = default_stop_words + my_stop_words

### Modeling Pipes

In [9]:
# X = both['title']
# y = both['subreddit_nsq']

# X = both['selftext']
# y = both['subreddit_nsq']

X = both['text']
y = both['subreddit_nsq']

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, stratify=y)

In [49]:
# Model dataframe
model_df = pd.DataFrame(columns=['Model', 'Train', 'Test'] )

#### Random Forests Classifiers

##### RFC Default/Test

In [12]:
pipe = Pipeline([
    ('cv', CountVectorizer()), 
    ('rfc', RandomForestClassifier()), 
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9996, 0.6692)

In [201]:
pipe = Pipeline([
    ('tv', TfidfVectorizer(tokenizer=mytokenizing_stem)), 
    ('rfc', RandomForestClassifier()), 
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9996, 0.6744)

##### RFC LEM

In [157]:
pipe = Pipeline([
    ('cv', CountVectorizer(tokenizer=mytokenizing_lem)), 
    ('rfc', RandomForestClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9996, 0.6665)

##### RFC Stem

In [167]:
pipe = Pipeline([
    ('cv', CountVectorizer(tokenizer=mytokenizing_stem, stop_words=my_stop_words)), 
    ('rfc', RandomForestClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9996, 0.6726)

##### Insights
* Stemming performs the best with RFC
* Overfit
* Best train test scores .99 .6726

#### Extra Trees Classifier

In [13]:
pipe = Pipeline([
    ('cv', CountVectorizer(tokenizer=mytokenizing_stem)), 
    ('efc', ExtraTreesClassifier(max_features=500))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9996, 0.6793)

#### Logistic Regression

In [39]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem)), 
    ('logreg', LogisticRegression(C=.003, random_state=24, max_iter=200))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.7045333333333333, 0.6857)

##### Insights
* train test scores .7309 .6886

#### Grid Search Logreg Pipe

In [88]:
pipe_params = {
    'cv__stop_words': [None],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [1.0, .95],
    'cv__tokenizer': [None, mytokenizing_lem, mytokenizing_stem],
    'logreg__C': [.0001, .001, .01, .1, 1, 10],
    'logreg__random_state': [24],
    'logreg__max_iter': [100, 150, 200, 250],
    'logreg__class_weight': [None, 'balanced'],
    'logreg__max_iter':[200, 400, 600]
}

In [None]:
gs = GridSearchCV(pipe,
                  param_grid=pipe_params,
                  cv=5)

gs.fit(X_train, y_train)

gs.best_score_

gs.best_params_

#### Randomized Search Logreg Pipe

In [86]:
# Random search
rscv = RandomizedSearchCV(estimator = pipe,
                     param_distributions = pipe_params,
                     scoring = 'f1_weighted',
                     n_iter = 100,
                     n_jobs = -2,
                     cv = 5,
                     verbose = 1)

# Fit our model
rscv.fit(X_train, y_train)

# Results
print("RANDOMIZED SEARCH")
print(f"Best score: {rscv.best_score_}")
print(f"Best params: {rscv.best_params_}")
print()

Fitting 5 folds for each of 100 candidates, totalling 500 fits
RANDOMIZED SEARCH
Best score: 0.6796131699911369
Best params: {'logreg__random_state': 24, 'logreg__max_iter': 600, 'logreg__class_weight': 'balanced', 'logreg__C': 0.01, 'cv__tokenizer': <function mytokenizing_stem at 0x0000028C07D04EE0>, 'cv__stop_words': None, 'cv__min_df': 2, 'cv__max_df': 0.95}



#### Bayesian Search SVC

In [45]:
%%time
# Same old pipeline
bs_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('svc', SVC(random_state = 24))
])

# Bayes params -- distributions!
bs_params = {
    'cv__stop_words': [None, 'english'],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [1.0, 5.0],
    'cv__tokenizer': [mytokenizing_stem],
    'svc__C': np.logspace(-5,2, 10), #Real(1e-5, 1e+2, prior='log-uniform'),  was: loguniform(1e-5,1e+2), # was: np.logspace(-5,2, 10),
    'svc__kernel': Categorical(['poly','rbf']),
    'svc__gamma': Categorical(['scale','auto']),
    'svc__degree': Integer(2,10), # can now sample all integer values freely
    'svc__coef0': Real(0,1, prior='uniform'), # was: np.linspace(0,1, 5),
    'svc__shrinking': Categorical([True, False])  
}

# Bayes hyperparameter search
svc_bs = BayesSearchCV(estimator = bs_pipe,
                     search_spaces = bs_params,
                     scoring = 'f1_weighted',
                     n_iter = 15,
                     cv = 5,
                     verbose = 1,
                     random_state=24,
                     n_jobs=10,
                        n_points=2
                      )

svc_bs.fit(X_train, y_train);
print(f"Best score: {svc_bs.best_score_}")
print(f"Best parameters: {svc_bs.best_params_}")

# credit to hyperparameters breakfasthour

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best score: 0.6807178116482109
Best parameters: OrderedDict([('cv__max_df', 3.9426340945991716), ('cv__min_df', 2), ('cv__stop_words', 'english'), ('cv__tokenizer', <function mytokenizing_stem at 0x0000023FAFB77B80>), ('svc__C', 2.782559402207126), ('svc__coef0', 0.014961257124677333), ('svc__degree', 4), ('svc__gamma', 'scale'), ('svc__kernel', 'rbf'), ('svc__shrinking', True)])
Wall time: 10h 43min 44s


In [None]:
# when exponentiating makes sure to divide log coefficient by standard deviation (works out in the math)
# or can do np.exp(logreg.coeff_/ss.scale_) or np.exp(log odds coefficient)**(1/std dev)

##### TfidVectorizer

In [182]:
pipe = Pipeline([
    ('tv', TfidfVectorizer(tokenizer=mytokenizing_stem)), 
    ('logreg', LogisticRegression(C=.01, random_state=24, max_iter=1000))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6607666666666666, 0.6501)

#### Ada Booster

In [192]:
pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('abc', AdaBoostClassifier(random_state=24, n_estimators=250))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6941333333333334, 0.6744)

#### Gradient Booster

In [80]:
pipe = Pipeline([
    ('cv', CountVectorizer(tokenizer=mytokenizing_stem)),
    ('gb', GradientBoostingClassifier(random_state=24, n_estimators=250))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.7074333333333334, 0.6858)

##### Insights:
* .7045 .6795
* .7003 .6839

#### XGBoost

In [200]:
pipe = Pipeline([
    ('cv', CountVectorizer(tokenizer=mytokenizing_stem)),
    ('xgb', XGBClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)





(0.7867666666666666, 0.683)

#### KNN

In [202]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem)),
    ('knn', KNeighborsClassifier(n_neighbors=200))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.6212333333333333, 0.6114)

#### Multinomial NaiveBayes

In [57]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem, max_features=8_000)),
    ('mnb', MultinomialNB())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.7253333333333334, 0.6886)

In [67]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem, max_features=8_000)),
    ('mnb', MultinomialNB(alpha=10))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.7143, 0.688)

.7305 .6912

In [12]:
with open('C:/Users/mmoli/GA/projects/project_3/data/pickles/mnb.pkl', 'wb') as pickle_out:
    pickle.dump(pipe, pickle_out)

In [205]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem)),
    ('mnb', MultinomialNB())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.7377666666666667, 0.691)

.7377 .691

#### Support Vector Machine

In [50]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2, tokenizer=mytokenizing_stem)),
    ('svc', SVC(C=2.78))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.85, 0.6859)

In [None]:
%%time
gs_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('svc', SVC(random_state = 24))
])

gs_params = {
    'svc__C': np.logspace(-5,2, 10),
    'svc__kernel': ['poly','rbf'],
    'svc__gamma': ['scale','auto'],
    'svc__degree': np.linspace(2,10,9),
    'svc__coef0': np.linspace(0,1, 5),
    'svc__shrinking': [True, False],   
}

svc_gs = GridSearchCV(estimator = gs_pipe,
                     param_grid = gs_params,
                     scoring = 'f1_weighted',
                     cv = 5,
                     n_jobs = -2,
                     verbose = 1)

# Fit model
svc_gs.fit(X_train, y_train)

print(f"Best score: {svc_gs.best_score_}")
print(f"Best params: {svc_gs.best_params_}")

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


In [None]:
%%time
rs_pipe = Pipeline([
    ('ss', CountVectorizer()),
    ('svc', SVC(random_state = 24))
])

# Hyperparameters -- distributions!
rs_params = {
    'svc__C': np.logspace (-5, 2, 10), #loguniform(1e-5,1e+2), # was: np.logspace(-5,2, 10),
    'svc__kernel': ['poly','rbf'],
    'svc__gamma': ['scale','auto'],
    'svc__degree': list(np.linspace(2,10,9)), # same as before because we need integers
    'svc__shrinking': [True, False],   
}

# Random search
svc_rs = RandomizedSearchCV(estimator = rs_pipe,
                     param_distributions = rs_params,
                     scoring = 'f1_weighted',
                     n_iter = 50,
                     n_jobs = -2,
                     cv = 5,
                     verbose = 1)

# Fit our model
svc_rs.fit(X_train, y_train)

# Results
print("RANDOMIZED SEARCH")
print(f"Best score: {svc_rs.best_score_}")
print(f"Best params: {svc_rs.best_params_}")
print()

Fitting 5 folds for each of 50 candidates, totalling 250 fits


#### Modeling Insights
* Models hit a test score limitation between .68 and .69; limitation of data or not enough featuring engineering to emphasize differences between NSQ and TATA

In [58]:
model_df['Model'] = ['Random Forest Classifier', 'Extra Trees Classifier', 'Logisitic Regression', 'Adaptive Booster', 'Gradient Booster', 'Extreme Gradient Booster', 'K-Nearest Neighbors', 'Multinomial Naive Bayes', 'Support Vector Machine']
model_df['Train'] = [.9996, .9996, .7045, .6941, .7074, .7867, .6212, .7377, .8500]
model_df['Test'] = [.6692, .6793, .6857, .6744, .6858, .6830, .6114, .6910, .6859]

In [63]:
models = model_df.sort_values(by='Test', ascending=False)

In [64]:
models

Unnamed: 0,Model,Train,Test
7,Multinomial Naive Bayes,0.7377,0.691
8,Support Vector Machine,0.85,0.6859
4,Gradient Booster,0.7074,0.6858
2,Logisitic Regression,0.7045,0.6857
5,Extreme Gradient Booster,0.7867,0.683
1,Extra Trees Classifier,0.9996,0.6793
3,Adaptive Booster,0.6941,0.6744
0,Random Forest Classifier,0.9996,0.6692
6,K-Nearest Neighbors,0.6212,0.6114


#### Sentiment Analyzer

In [16]:
# instantiating 
sia = SentimentIntensityAnalyzer()

# scores for each string
scores = [sia.polarity_scores(each) for each in both['text']]

# df of scores
score_total = pd.DataFrame(scores)

# merging sia scores with both dataframe
both = both.merge(score_total, left_index=True, right_index=True)

In [17]:
# title word count
both['title_word_count'] = both['title'].map(str.split).map(len)

# title length
both['title_length'] = both['title'].map(len)

# selftext word count
both['selftext_word_count'] = both['selftext'].map(str.split).map(len)

# selftext length
both['selftext_length'] = both['selftext'].map(len)

In [18]:
# total word count (least efficient way)
word_count = []
length_count = []
for i in range(0, len(both)):
    x = both['title_word_count'][i] + both['selftext_word_count'][i]
    word_count.append(x)
    y = both['title_length'][i] + both['selftext_length'][i]
    length_count.append(y)

both['total_word_count'] = word_count

both['total_length'] = length_count

In [19]:
# emoji function altered for counts
def emoji_num_counter(array):
    emoji_list = []
    count_list = []
    for text in array:
        data = regex.findall(r'\X', text)
        for word in data:
            emoji_counter = 0
            if any(char in emoji.UNICODE_EMOJI['en'] for char in word):
                emoji_list.append(word)
                emoji_counter +=1
            count_list.append(emoji_counter)
    return sum(count_list)

In [20]:
# emoji count column
for col in ['text']:
    new_list = []
    for each in both['text']:
        new_list.append(emoji_num_counter(each))

both['emoji_count'] = new_list

#### Logistic Regression Using Non-NLP Methods

In [21]:
# correlation of non nlp features
both.drop(columns=['title', 'selftext', 'text']).corr()[['subreddit_nsq']]

Unnamed: 0,subreddit_nsq
subreddit_nsq,1.0
created_utc,0.622564
neg,-0.13892
neu,0.153235
pos,-0.059483
compound,0.052739
title_word_count,-0.002188
title_length,-0.002417
selftext_word_count,-0.140096
selftext_length,-0.136358


In [264]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
X = both[['created_utc', 'neg', 'neu', 'compound', 'total_word_count', 'total_length']]
y = both['subreddit_nsq']

In [319]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=24, stratify=y)

In [320]:
pipe = Pipeline([
    ('ms', StandardScaler()),
    ('etc', GradientBoostingClassifier(random_state=24, n_estimators=250))
]) 

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.8684333333333333, 0.8573)

In [None]:
# models capping off at the same spot --- indication that that's as good as it gets
# Next steps:
# confusion matrix: where are the errors? how to optimize for that--look at distribution
# 

#### Insights:
* Surprisingly created_utc has the best correlation with subreddits
* Sia has limited value to modeling and can be excluded
* The model above reflects that classification for this data set is better and more efficiently achieved by using utc than the actual text