In [73]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import export_text, DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [74]:
cleaned_title = pd.read_csv('./data/cleaned_data.csv')
print(cleaned_title.shape)
cleaned_title.head(3)

(9661, 14)


Unnamed: 0,title,score,subreddit,url,num_comments,body,created,date,abv_median,title_len,body_len,external_link,title_clean,stem_string
0,In what condition would the atmosphere have to...,0,climatechange,https://www.reddit.com/r/climatechange/comment...,5,What is currently happening in Earth is it's ...,1661955000.0,2022-08-31 14:10:44,0,91,195,0,in what condition would the atmosphere have to...,in what condit would the atmospher have to be ...
1,Has anyone here been personally affected by a ...,26,climatechange,https://www.reddit.com/r/climatechange/comment...,21,"With what’s happened in Pakistan, ongoing wild...",1661933000.0,2022-08-31 08:07:27,1,69,217,0,has anyone here been personally affected by a ...,ha anyon here been person affect by a major cl...
2,A prolonged and record heat wave builds over t...,54,climatechange,https://www.cnn.com/2022/08/30/weather/record-...,10,0,1661902000.0,2022-08-30 23:33:35,1,69,1,1,a prolonged and record heat wave builds over t...,a prolong and record heat wave build over the ...


In [75]:
cleaned_title.isnull().sum()

title            0
score            0
subreddit        0
url              0
num_comments     0
body             0
created          0
date             0
abv_median       0
title_len        0
body_len         0
external_link    0
title_clean      3
stem_string      3
dtype: int64

In [76]:
# Dropping null values in 'title_clean'
i = cleaned_title[(cleaned_title.title_clean.isnull())].index
cleaned_title.drop(i, inplace=True)

In [77]:
cleaned_title.isnull().sum()

title            0
score            0
subreddit        0
url              0
num_comments     0
body             0
created          0
date             0
abv_median       0
title_len        0
body_len         0
external_link    0
title_clean      0
stem_string      0
dtype: int64

# Modeling:

## Title text:

### CountVectorizer:

In [39]:
# random_forest_params = {
#     'max_depth' : [None, 5, 10],
#     'min_samples_leaf': [2, 3, 5],
#     'n_estimators': [100, 125, 150]
    
# }

# gs = GridSearchCV(
#     RandomForestClassifier(), param_grid=random_forest_params, verbose=1)

# gs.fit(X_train_cv, y_train)

# print(gs.best_score_, gs.best_params_)

# gs.score(X_train_cv, y_train), gs.score(X_test_cv, y_test)

**results** from Gridsearch: Fitting 5 folds for each of 27 candidates, totalling 135 fits

0.5822045152722444 {'max_depth': None, 'min_samples_leaf': 3, 'n_estimators': 100}

(0.7698539176626826, 0.5828685258964144)


In [78]:
X = cleaned_title['stem_string']
y = cleaned_title['abv_median']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=.75, random_state=42)

In [79]:
# cv = CountVectorizer(stop_words='english', binary=True, ngram_range=(1,2))
cv= CountVectorizer()
cv.fit(X_train)

X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [80]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train_cv, y_train)
dtc.score(X_train_cv, y_train), dtc.score(X_test_cv, y_test)

(0.995858069860555, 0.553623188405797)

In [81]:
rfc = RandomForestClassifier(random_state=42, min_samples_leaf=3, n_estimators=150)
rfc.fit(X_train_cv, y_train)
rfc.score(X_train_cv, y_train), rfc.score(X_test_cv, y_test)

(0.7640480463896175, 0.6037267080745342)

In [82]:
abc = AdaBoostClassifier(random_state=42, n_estimators=150)

abc.fit(X_train_cv, y_train)
abc.score(X_train_cv, y_train), abc.score(X_test_cv, y_test)

(0.6584288278337705, 0.5991718426501035)

scores from text_clean, ngram_range(1,2)

* DTC: .996 .531
* RFC: .765 .598
* ABC: .676 .603

scores from text_clean, stop_words='english', binary=True, ngram_range=(1,2):

* DTC: .993 .558
* RFC: .756 .580
* ABC: .646 .555

scores from stem_string, all default:

* DTC: .996 .549
* RFC: .765 .604
* ABC: .670 .600

scores from stem_string, ngram_range=(1,2)

* DTC: .996 .548
* RFC: .770 .609
* ABC: .674 .596

In [83]:
pipe = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,2))),
    ('lr', LogisticRegression(max_iter=1000)),
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9866077592157946, 0.5900621118012422)

### TfidVectorizer:

In [84]:
# X = data_clean['title_clean']
# y = data_clean['abv_median']

X = cleaned_title['stem_string']
y = cleaned_title['abv_median']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=.75, random_state=42)

In [85]:
tf = TfidfVectorizer(ngram_range=(1,2)) #stop_words="english", 

# fit & transform
tf.fit(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)

In [86]:
text_tf = pd.DataFrame(X_train_tf.A, columns=tf.get_feature_names_out())

In [87]:
text_tf.shape

(7243, 46565)

In [88]:
text_tf.sum().sort_values().head(5)

peak but        0.137683
declin the      0.137683
climat will     0.137683
peak when       0.137683
start declin    0.137683
dtype: float64

In [89]:
text_tf.sum().sort_values().tail()

to              150.790458
climat chang    180.517322
chang           189.651072
the             192.111240
climat          210.027476
dtype: float64

In [90]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train_tf, y_train)
dtc.score(X_train_tf, y_train), dtc.score(X_test_tf, y_test)

(0.995858069860555, 0.5469979296066253)

In [91]:
rfc = RandomForestClassifier(random_state=42, min_samples_leaf=3, n_estimators=150)
rfc.fit(X_train_tf, y_train)
rfc.score(X_train_tf, y_train), rfc.score(X_test_tf, y_test)

(0.8290763495789037, 0.5763975155279503)

In [92]:
abc = AdaBoostClassifier(random_state=42, n_estimators=150)

abc.fit(X_train_tf, y_train)
abc.score(X_train_tf, y_train), abc.score(X_test_tf, y_test)

(0.6882507248377744, 0.5797101449275363)

In [142]:
# Inspiration from: https://towardsdatascience.com/multi-class-text-classification-model-comparison-and-selection-5eb066197568
X = cleaned_title['stem_string']
y = cleaned_title['abv_median']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=.75, random_state=42)

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [143]:
%%time
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.5937888198757764
              precision    recall  f1-score   support

           0       0.62      0.59      0.60      1267
           1       0.57      0.60      0.58      1148

    accuracy                           0.59      2415
   macro avg       0.59      0.59      0.59      2415
weighted avg       0.59      0.59      0.59      2415

CPU times: user 31.8 ms, sys: 2.03 ms, total: 33.9 ms
Wall time: 32.6 ms


In [145]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(max_iter=200, n_jobs=1)),
               ])

logreg.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(max_iter=200, n_jobs=1))])

In [146]:
logreg.score(X_train, y_train), logreg.score(X_test, y_test)

(0.7690183625569516, 0.6037267080745342)

In [127]:
%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6037267080745342
              precision    recall  f1-score   support

           0       0.62      0.65      0.63      1267
           1       0.59      0.55      0.57      1148

    accuracy                           0.60      2415
   macro avg       0.60      0.60      0.60      2415
weighted avg       0.60      0.60      0.60      2415

CPU times: user 34 ms, sys: 4.06 ms, total: 38 ms
Wall time: 41.2 ms


In [128]:
rfc = RandomForestClassifier(random_state=42, min_samples_leaf=3, n_estimators=150)
rfc.fit(X_train_cv, y_train)
rfc.score(X_train_cv, y_train), rfc.score(X_test_cv, y_test)

(0.7640480463896175, 0.6037267080745342)

## Other (non-text) Features:

In [99]:
cleaned_title.columns

Index(['title', 'score', 'subreddit', 'url', 'num_comments', 'body', 'created',
       'date', 'abv_median', 'title_len', 'body_len', 'external_link',
       'title_clean', 'stem_string'],
      dtype='object')

In [101]:
features = ['score', 'title_len', 'body_len', 'external_link']

In [129]:
X = cleaned_title[features]
y = cleaned_title['abv_median']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75, random_state=42)

In [130]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

feat_logreg = Pipeline([('ss', StandardScaler()),
                ('pf', PolynomialFeatures(include_bias=True)),
                ('lr', LogisticRegression(max_iter=200, n_jobs=1)),
               ])

In [131]:
feat_logreg.fit(X_train, y_train)
feat_logreg.score(X_train, y_train), feat_logreg.score(X_test, y_test)

(0.6487643241750656, 0.6140786749482402)

In [132]:
y_pred = feat_logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6140786749482402
              precision    recall  f1-score   support

           0       0.61      0.70      0.65      1252
           1       0.62      0.52      0.56      1163

    accuracy                           0.61      2415
   macro avg       0.61      0.61      0.61      2415
weighted avg       0.61      0.61      0.61      2415



## Text AND other features:

In [152]:
from sklearn.compose import ColumnTransformer
numeric_features = ['title_len', 'body_len', 'score', 'external_link']
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

text_features = ['stem_string']
text_transformer = Pipeline(
    steps=[('vect', CountVectorizer(ngram_range=(1,2))),
                ('tfidf', TfidfTransformer())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("text", text_transformer, text_features),
    ]
)


In [153]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=200))]
)

X = cleaned_title[['title_len', 'body_len', 'score', 'external_link', 'stem_string']]
y = cleaned_title['abv_median']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.75, random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 7243 and the array at index 1 has size 1