# Import Libraries

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import re
import json
import time
import datetime
import math
import nltk
import string


from psaw import PushshiftAPI
import praw


from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 

%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)



import warnings
warnings.filterwarnings('ignore')

---

# Comparing Logistic Regression and Naive Bayes predictions

In [6]:
# Load the saved X_test results of each model; later cells read the
# y_test, y_pred and title columns from both files.
LOGREG_RESULTS_CSV = 'X_test_results_LogReg.csv'
NB_RESULTS_CSV = 'X_test_results_NB.csv'

df_log = pd.read_csv(LOGREG_RESULTS_CSV)
df_nb = pd.read_csv(NB_RESULTS_CSV)

In [7]:
def label_prediction_results(frame, result_col):
    """Add a 'correct' / 'wrong' column comparing y_test with y_pred.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'y_test' and 'y_pred'. Modified in place.
    result_col : str
        Name of the column to create (or overwrite) with the outcome.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    # One vectorised comparison replaces the per-row loop. The column is
    # created directly with string values (no integer placeholder that is
    # later overwritten with strings), and the result does not depend on the
    # frame having a default 0..n-1 index.
    is_wrong = frame['y_test'] != frame['y_pred']
    frame[result_col] = np.where(is_wrong, 'wrong', 'correct')
    return frame


# flag every row of each model's results as a correct or wrong prediction
label_prediction_results(df_log, 'logreg_result')
label_prediction_results(df_nb, 'nb_result')

# Positional indices of the rows Naive Bayes got wrong, kept under the
# original name in case a later cell still reads it.
incorrect_pred = np.where(df_nb['y_test'] != df_nb['y_pred'])

In [8]:
# Give the shared column names a per-model suffix so the two frames can be
# told apart after merging. 'Unnamed: 0' (presumably the row index saved
# with each CSV - confirm) becomes the merge key 'index'.
logreg_column_names = {
    'Unnamed: 0': 'index',
    'y_test': 'y_test_logreg',
    'y_pred': 'y_pred_logreg',
}
nb_column_names = {
    'Unnamed: 0': 'index',
    'y_test': 'y_test_nb',
    'y_pred': 'y_pred_nb',
}

df_log.rename(columns=logreg_column_names, inplace=True)
df_nb.rename(columns=nb_column_names, inplace=True)

In [9]:
# Join the two models' results on the shared 'index' key.
# 'title' is present on both sides, so the merged frame carries it twice,
# as 'title_x' (Naive Bayes side) and 'title_y' (Log Reg side).
logreg_columns = ['index', 'y_test_logreg', 'y_pred_logreg', 'title', 'logreg_result']
df_comparison = df_nb.merge(df_log[logreg_columns], on='index')

In [10]:
# preview the first five merged rows
df_comparison.head(5)

Unnamed: 0,index,y_test_nb,y_pred_nb,title_x,nb_result,y_test_logreg,y_pred_logreg,title_y,logreg_result
0,1941,0,0,"Onion Social CEO Appears Before Hague Tribunal To Be Tried For Crimes Against Humanity, Promote New Website Features",correct,0,0,"Onion Social CEO Appears Before Hague Tribunal To Be Tried For Crimes Against Humanity, Promote New Website Features",correct
1,4088,0,0,Flustered Mathematician Unable To Recommend Good Number,correct,0,0,Flustered Mathematician Unable To Recommend Good Number,correct
2,4564,1,1,Judge loosens Apple’s grip on app store in Epic decision,correct,1,0,Judge loosens Apple’s grip on app store in Epic decision,wrong
3,2075,1,1,"Chinese ‘crackdown’ on tech IPOs could lead to US share delistings, experts warn",correct,1,1,"Chinese ‘crackdown’ on tech IPOs could lead to US share delistings, experts warn",correct
4,3,0,1,"Middle-Aged Woman Angrily Demanding Price Check Was Once Carefree Youth, Onlookers Speculate",wrong,0,0,"Middle-Aged Woman Angrily Demanding Price Check Was Once Carefree Youth, Onlookers Speculate",correct


In [11]:
# how many posts BOTH Naive Bayes and Log Reg misclassified
both_wrong = (df_comparison['nb_result'] == 'wrong') & (df_comparison['logreg_result'] == 'wrong')
len(df_comparison.loc[both_wrong])

14

In [12]:
# how many posts Naive Bayes got right while Log Reg got them wrong
only_logreg_wrong = df_comparison['nb_result'].eq('correct') & df_comparison['logreg_result'].eq('wrong')
int(only_logreg_wrong.sum())

79

In [13]:
# how many posts Log Reg got right while Naive Bayes got them wrong
only_nb_wrong = df_comparison['nb_result'].eq('wrong') & df_comparison['logreg_result'].eq('correct')
int(only_nb_wrong.sum())

93

In [14]:
# total number of posts Naive Bayes misclassified
nb_wrong = df_comparison['nb_result'].eq('wrong')
len(df_comparison.loc[nb_wrong])

107

In [15]:
# total number of posts Log Reg misclassified
logreg_wrong = df_comparison['logreg_result'].eq('wrong')
len(df_comparison.loc[logreg_wrong])

93

### Comparison dataframe

In [16]:
# Turn the true class label into a readable subreddit name and tidy up the
# duplicated columns left over from the merge.

# Vectorised lookup instead of a per-row .loc loop: label 0 -> r/TheOnion,
# label 1 -> r/news. Any other (or missing) label keeps the 'a' placeholder,
# exactly as the loop version left it.
SUBREDDIT_BY_LABEL = {0: "r/TheOnion", 1: 'r/news'}
df_comparison['subreddit'] = (
    df_comparison['y_test_nb'].map(SUBREDDIT_BY_LABEL).fillna('a')
)

# Keep a single title column under a plain name. 'title' is added after
# 'subreddit' so the column order of the frame stays the same as before.
# NOTE(review): assumes title_x and title_y are identical for every row - confirm.
df_comparison['title'] = df_comparison['title_x']
df_comparison = df_comparison.drop(columns=[
    'y_test_logreg', 'title_y', 'y_test_nb', 'title_x'
])

In [17]:
# posts that BOTH Naive Bayes and Log Reg predicted WRONGLY
missed_by_both = df_comparison['nb_result'].eq('wrong') & df_comparison['logreg_result'].eq('wrong')
df_comparison.loc[missed_by_both]

Unnamed: 0,index,y_pred_nb,nb_result,y_pred_logreg,logreg_result,subreddit,title
70,292,1,wrong,1,wrong,r/TheOnion,Software Indicates Missing Child Likely A Prostitute By Now
152,1866,1,wrong,1,wrong,r/TheOnion,Inclusive New Texas Bill Prevents Gun Sellers From Discriminating On Basis Of Background Check
164,721,1,wrong,1,wrong,r/TheOnion,Apple Introduces Eggplant Emoji Covered In Sores
181,4351,0,wrong,0,wrong,r/news,Charlie Kirk being asked about when 'we' get to use the guns.
211,3668,0,wrong,0,wrong,r/news,"Molly Ball, author of this article, is TIME's National Political Correspondent. Previously, she covered U.S. politics for The Atlantic and Politico, and worked for newspapers in Nevada and Cambodia. She is the author of Pelosi, a bestselling biography of the first woman Speaker of the House."
235,3621,1,wrong,1,wrong,r/TheOnion,T.J. Miller Arrested For Alleged Fake Bomb Threat
250,4479,1,wrong,1,wrong,r/TheOnion,"They Said What?!: Find Out What Wolfgang Puck, Surgeon General Jerome Adams, And Cynthia Nixon Have To Say"
356,28,1,wrong,1,wrong,r/TheOnion,North Dakota Found To Be Harboring Nuclear Missiles
500,312,1,wrong,1,wrong,r/TheOnion,Gunman Kills 15 Potential Swing Voters
523,3124,1,wrong,1,wrong,r/TheOnion,NYC Opens $500 Million Decoy Subway Station To Catch Turnstile Jumpers
