# Imports

In [79]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim import corpora
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import re
import spacy


# Read and Clean Data

In [54]:
art_df = pd.read_csv('./db_df.csv')
ratings= pd.read_csv('./ratings_temp.csv')

In [55]:
art_df.dropna(inplace=True)
ratings.dropna(inplace=True)

In [56]:
art_list = list(art_df['content'].astype(str).to_numpy())
rating_list = list(ratings['article'].astype(str).to_numpy())

In [57]:
def list_clean(textlist):
    textlist = [t.lower() for t in textlist]
    return textlist

In [58]:
rating_clean = list_clean(rating_list) 
art_clean = list_clean(art_list)
text_in = rating_clean+art_clean

In [81]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(rating_clean)
y = ratings['negative']
X_train, X_test,y_train,y_test = train_test_split(X,y)

# Random Forest

In [85]:

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [86]:
rf.score(X_train,y_train)

0.9905660377358491

In [87]:
rf.score(X_test,y_test)

0.704225352112676

# KNN

In [82]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [83]:
knn.score(X_train,y_train)

0.8301886792452831

In [84]:
knn.score(X_test,y_test)

0.6056338028169014

# Naive Bayes

In [88]:
mnb = MultinomialNB()
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [89]:
mnb.score(X_train,y_train)

0.9575471698113207

In [90]:
mnb.score(X_test,y_test)

0.647887323943662

# Support Vector Classifier

In [92]:
sv = SVC(gamma='scale')
sv.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [93]:
sv.score(X_train,y_train)

0.9811320754716981

In [95]:
sv.score(X_test,y_test)

0.6619718309859155

In [None]:
y_proba = sv.predict_proba(X_test)
neg_proba = y_proba[:,1]

# Write Data to CSV

In [None]:
art_df['preds'] = y_pred

In [None]:
art_df['neg_proba'] = neg_proba

In [74]:
art_df=art_df[['content','publishedAt','source_name','subjects','preds','neg_proba']]

In [76]:
art_df.to_csv('./db_df_preds')