# Import Libraries

In [125]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import re
import json
import time
import datetime
import math


from psaw import PushshiftAPI
import praw


from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import warnings
warnings.filterwarnings('ignore')

---

# Preprocessing

### Import data for 2 types of modeling :
1. Using logistic regression / KNN
2. Using NLP Naive Bayes

In [126]:
df_combined = pd.read_csv("/Users/mohammadiyliahaziq/Desktop/GA/dsi25-workspace/project_3/data/df_for_modelling_top2500.csv")
df_combined = df_combined.drop(columns='Unnamed: 0')

**We take the top 2500 posts in each category and proceed with logistic regression**
- first we convert `created_utc` column into `day` and `hour` columns
- then we onehotencode the target variable `subreddit` column

In [127]:
# converting and splitting the created_utc column into its
# respective month, day, hour columns
df_combined['created_utc'] = df_combined['created_utc'].map(lambda x:
                                                    datetime.datetime.fromtimestamp(x))

df_combined['day'] =  df_combined['created_utc'].dt.day
df_combined['hour'] =  df_combined['created_utc'].dt.hour

# dropping columns that are not required
df_combined = df_combined.drop(columns=['created_utc','title'])

In [128]:
# converting the target variable subreddit column into 1 and 0, where
# 1 indicates that the post is from r/news and 
# 0 indicates that the post is from r/TheOnion
df_combined.reset_index(drop=True, inplace=True)
df_combined['subreddit'].replace({'news':1,'TheOnion': 0}, inplace=True)

# shifting the target variable subreddit column to the last column 
temp = df_combined['subreddit']
df_combined = pd.concat((df_combined.drop(columns='subreddit'),temp),axis=1)


df_combined = df_combined.astype({'day':'object', 'hour':'object'})
df_combined = pd.get_dummies(df_combined, columns=['day','hour'])

**Dummify categorical variables**

___

# Modeling - Logistic Regression

In [129]:
# randomize the rows
# df_combined = df_combined.sample(frac=1).reset_index(drop=True)

# <span style="color:blue">QUESTION</span>
- fit_transform
- standard scaler ==> results drop
- does it help to show our model's performance on more unseen data?
- how to make sense of the Naive Bayes scores ==> how do we intepret the highest scoring features?
- should i use DT/RF/boosting if they all show 0.99?
- jupyter notebook organisation

In [130]:
# Why when dont apply standarscaler, results improve but result for external set is very poor?

# train test split
X_train, X_test, y_train, y_test = train_test_split(df_combined.drop(columns='subreddit'),
                                                   df_combined['subreddit'], test_size=0.2,
                                                   random_state=24)
# apply standardscaler
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

# dont apply standardscaler
# X_train_sc = X_train
# X_test_sc = X_test

print(X_train_sc.shape)
print(y_train.shape)

(4000, 58)
(4000,)


**deploy and evaluate model**
- QUESTION: WHY DOES STANDARD SCALER GIVE WORSE RESULT?
- w/o ss: train/test 0.96/0.95
- w ss: train/test 0.86/0.86

In [132]:
LogReg = LogisticRegression(solver='liblinear')
LogReg.fit(X_train_sc, y_train)
y_pred = LogReg.predict(X_test_sc)
print(LogReg.score(X_train_sc, y_train))
print(LogReg.score(X_test_sc, y_test))

0.96825
0.956


#### GridSearchCV for hyperparameter tuning

In [133]:
from sklearn.model_selection import GridSearchCV
# use GridSearchCV to find the hyperparameters that gives the best modelling scores 
LogReg = LogisticRegression()
LogReg = GridSearchCV(estimator=LogReg, param_grid={'C': [1,10,20], 'solver':['newton-cg','liblinear','lbfgs'],
                      'l1_ratio':[0.25,0.5,0.75]}, verbose = 1, cv =5, return_train_score = False)

# fit scaled train data into the Logistic Regression GridSearch model 
LogReg.fit(X_train_sc, y_train)
LogReg.cv_results_
# df = pd.DataFrame(LogReg.cv_results_)
# df = df[['mean_test_score','param_solver','param_l1_ratio','param_C']]
# df.sort_values(by='mean_test_score', ascending=False).head()
LogReg.best_params_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


{'C': 1, 'l1_ratio': 0.25, 'solver': 'newton-cg'}

In [134]:
# apply the best hyperparameters which we got from GridSearch above into our model.
LogReg_GridSearch = LogisticRegression(C= 20, l1_ratio= 0.25, solver= 'newton-cg')
LogReg_GridSearch.fit(X_train_sc, y_train)
y_pred = LogReg_GridSearch.predict(X_test_sc)
print(LogReg_GridSearch.score(X_train_sc, y_train))
print(LogReg_GridSearch.score(X_test_sc, y_test))

0.9675
0.955


# Visualizing additional external datasets

In [135]:
df_news_testext = pd.read_csv('df_news.csv')
df_onion_testext = pd.read_csv('df_onion.csv')
df_news_testext['subreddit'] = 1
df_onion_testext['subreddit'] = 0
df_news_testext['num_char'] = df_news_testext['title'].map(lambda x: len(x))
df_onion_testext['num_char'] = df_onion_testext['title'].map(lambda x: len(x))
df_combined_testext = pd.concat((df_news_testext,df_onion_testext), axis=0)
df_combined_testext = df_combined_testext.sample(frac=1).reset_index(drop=True)
len(set(df_combined_testext.title))

1968

In [136]:
df_combined_testext = df_combined_testext.drop(columns='Unnamed: 0')

# converting
# and splitting the created_utc column into its
# respective month, day, hour columns
df_combined_testext['created_utc'] = df_combined_testext['created_utc'].map(lambda x:
                                                    datetime.datetime.fromtimestamp(x))

df_combined_testext['day'] =  df_combined_testext['created_utc'].dt.day
df_combined_testext['hour'] =  df_combined_testext['created_utc'].dt.hour

# dropping columns that are not required
df_combined_testext = df_combined_testext.drop(columns=['created_utc', 'selftext'])
df_combined_testext = df_combined_testext.astype({'day':'object', 'hour':'object'})
df_combined_testext = pd.get_dummies(df_combined_testext, columns=['day','hour'])

In [137]:
X = df_combined_testext.drop(columns=['title','subreddit'])
y = df_combined_testext['subreddit']

# X_sc = X
ss = StandardScaler()
X_sc = ss.fit_transform(X)
y_pred = LogReg_GridSearch.predict(X_sc)
print(LogReg_GridSearch.score(X_sc,y))

0.5230460921843687


___