In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the headline dataset from the project's GitHub repository and discard
# the 'Unnamed: 0' column that comes with the CSV.
totalData = pd.read_csv(
    'https://raw.githubusercontent.com/indicium15/sc1015-project/main/totalData.csv'
).drop(columns=['Unnamed: 0'])
print(totalData.shape)

# Remove every row that contains a missing value, then report the new shape.
totalData = totalData.dropna()
print(totalData.shape)

(44898, 2)
(44891, 2)


In [None]:
# Preview the title column as loaded, before any lemmatization is applied.
print(totalData.loc[:, 'title'])

0        donald trump sends out embarrassing new years ...
1        drunk bragging trump staffer started russian c...
2        sheriff david clarke becomes an internet joke ...
3        trump is so obsessed he even has obamas name c...
4        pope francis just called out donald trump duri...
                               ...                        
44893    fully committed nato backs new us approach on ...
44894    lexisnexis withdrew two products from chinese ...
44895    minsk cultural hub becomes haven from authorities
44896    vatican upbeat on possibility of pope francis ...
44897    indonesia to buy 114 billion worth of russian ...
Name: title, Length: 44891, dtype: object


In [None]:
def lemmatize(text):
    """Return ``text`` with each whitespace-separated token lemmatized.

    The string is split with ``str.split()``, every token is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are re-joined with single spaces.

    NOTE(review): no part-of-speech is passed to the lemmatizer, which is
    presumably why tokens such as "has" come out as "ha" — confirm this is
    acceptable for the downstream features.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, text.split()))


In [None]:
# Replace every title with its lemmatized form, using the lemmatize helper
# defined in the previous cell.
totalData['title'] = totalData['title'].map(lemmatize)

In [None]:
# Separate the frame into features (every column except 'class') and the
# 'class' label column.
x = totalData.drop(columns=['class'])
y = totalData['class']

In [None]:
# Preview the title feature column; the titles have been lemmatized by now.
print(x.loc[:, 'title'])

0        donald trump sends out embarrassing new year e...
1        drunk bragging trump staffer started russian c...
2        sheriff david clarke becomes an internet joke ...
3        trump is so obsessed he even ha obamas name co...
4        pope francis just called out donald trump duri...
                               ...                        
44893    fully committed nato back new u approach on af...
44894    lexisnexis withdrew two product from chinese m...
44895      minsk cultural hub becomes haven from authority
44896    vatican upbeat on possibility of pope francis ...
44897    indonesia to buy 114 billion worth of russian jet
Name: title, Length: 44891, dtype: object


In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the labels
# and uses a fixed seed so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
    stratify=y,
)

In [None]:
# Turn the titles into TF-IDF features: English stop words removed, unigrams
# through trigrams, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,3), lowercase=True, max_features=5000)

# Learn the vocabulary and IDF weights from the training titles only.
x_train_trans = tfidf.fit_transform(x_train['title'])

# Reuse the already-fitted vectorizer for the test titles. Calling
# fit_transform here would refit on the test set and build a different
# vocabulary, so the test matrix columns would no longer correspond to the
# features the classifier is trained on.
x_test_trans = tfidf.transform(x_test['title'])

print(x_train)
print(x_train_trans)

                                                   title
2413   ivanka trump tweeted about religious tolerance...
29801            house vote to begin repealing obamacare
34531             trump skip debate win on social medium
43993  with trump meeting malaysia pm seek to put 1md...
93     texas sheriff threatens to jail truck owner fo...
...                                                  ...
24297  exdemocratic leader who mulled dropping clinto...
42173  only miracle can move brexit talk forward by o...
32245  trump demand u supreme court justice ginsburg ...
21567  the list of who who taking advantage of failed...
19274  gang of domestic terrorist violently attack lo...

[31423 rows x 1 columns]
  (0, 2199)	0.36372101265184437
  (0, 1846)	0.38314491498252995
  (0, 2040)	0.4230518149022874
  (0, 1766)	0.30449081085375324
  (0, 3495)	0.37616005325436547
  (0, 4641)	0.4230518149022874
  (0, 4456)	0.11006922139743076
  (0, 2198)	0.34151874009572075
  (1, 2009)	0.4995488627544455
  (1,

In [None]:
# Shape of the training TF-IDF matrix: (number of training titles, number of features).
x_train_trans.shape

(31423, 5000)

In [None]:
# Train a random forest on the TF-IDF features of the training titles.
# random_state is pinned (same seed as the train/test split) so the fitted
# forest, and every metric computed from it, is reproducible across runs.
rfc = RandomForestClassifier(random_state=100)
rfc.fit(x_train_trans, y_train)

RandomForestClassifier()

In [None]:
# Predict a class label for every row of the transformed test features.
y_pred = rfc.predict(x_test_trans)

In [None]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5248, 1795],
       [4796, 1629]])

In [None]:
# Fraction of test titles whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5106177606177607