In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [3]:
# Read the labelled training set (a URL in `Content`, a 0/1 label in `Target`)
# and preview the first rows.
training_csv = "tmp/training_data.csv"
df = pd.read_csv(training_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,Ind,Content,Target
0,0,0,https://www.aninews.in/news/world/asia/india-h...,0
1,1,2,https://www.620ckrm.com/2021/01/06/fire-crews-...,0
2,2,13,https://www.thestar.com.my/news/nation/2021/01...,1
3,3,14,https://www.watoday.com.au/world/north-america...,0
4,4,16,https://magic1079.iheart.com/content/2021-01-0...,0


In [4]:
def get_text(row):
    """Pull a headline-like string out of the URL held in ``row['Content']``.

    The URL is split on "/" and the longest segment is selected (the first
    such segment when several share the maximum length). Hyphens in that
    segment are replaced by spaces before it is returned.
    """
    url_segments = row['Content'].split("/")
    longest_segment = max(url_segments, key=len)
    return longest_segment.replace("-", " ")

# Derive the raw text feature row by row from each URL.
df['text'] = df.apply(get_text, axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,Ind,Content,Target,text
0,0,0,https://www.aninews.in/news/world/asia/india-h...,0,india hands over nrs 306 million for reconstru...
1,1,2,https://www.620ckrm.com/2021/01/06/fire-crews-...,0,fire crews trained to deal with grain entrapments
2,2,13,https://www.thestar.com.my/news/nation/2021/01...,1,speed up aid to flood victims in rural areas s...
3,3,14,https://www.watoday.com.au/world/north-america...,0,trump played with fire and american democracy ...
4,4,16,https://magic1079.iheart.com/content/2021-01-0...,0,2021 01 06 two police officers connected to br...


In [5]:
# Count how many rows fall under each label to check class balance.
label_counts = df['Target'].value_counts()
label_counts

0    1235
1    1057
Name: Target, dtype: int64

In [11]:
import ssl

# Workaround for Python installs where nltk.download fails certificate
# verification: temporarily make the default HTTPS context an unverified one.
# The original factory is remembered and put back once the downloads finish,
# so certificate checking is not left disabled for the rest of the kernel
# session.
_saved_https_context = getattr(ssl, "_create_default_https_context", None)

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python has no unverified-context helper; leave the default alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    nltk.download("punkt")
    stopwords_downloaded = nltk.download('stopwords')
finally:
    # Restore certificate verification even if a download raised.
    if _saved_https_context is not None:
        ssl._create_default_https_context = _saved_https_context

# Keep the cell's displayed value: the result of the last download call.
stopwords_downloaded

[nltk_data] Downloading package punkt to /Users/kian/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
import string


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def text_process(text):
    """Remove punctuation and English stop words from ``text``.

    Every character in ``string.punctuation`` is deleted, the result is split
    on whitespace, and words whose lowercase form is an NLTK English stop word
    are dropped. The surviving words keep their original casing and are joined
    with single spaces.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string (empty if no words survive).
    """
    # Look the stop words up once per call instead of once per word; the
    # helper caches them so the corpus is not re-read for every row either.
    stop_words = _english_stopwords()
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = [
        word for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]
    return " ".join(kept_words)

# Replace the raw text column with its punctuation- and stop-word-free version.
cleaned_text = df['text'].apply(text_process)
df['text'] = cleaned_text
df.head()

Unnamed: 0.1,Unnamed: 0,Ind,Content,Target,text
0,0,0,https://www.aninews.in/news/world/asia/india-h...,0,india hands nrs 306 million reconstruction ear...
1,1,2,https://www.620ckrm.com/2021/01/06/fire-crews-...,0,fire crews trained deal grain entrapments
2,2,13,https://www.thestar.com.my/news/nation/2021/01...,1,speed aid flood victims rural areas says king
3,3,14,https://www.watoday.com.au/world/north-america...,0,trump played fire american democracy got burnt...
4,4,16,https://magic1079.iheart.com/content/2021-01-0...,0,2021 01 06 two police officers connected breon...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Turn the cleaned text into a TF-IDF matrix (one row per document).
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed further down, so the held-out rows contribute to
# the vocabulary and IDF weights. Consider fitting on the training rows only.
corpus = df['text']
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
vectors.shape

(2292, 6132)

In [8]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    df['Target'],
    test_size=0.15,
    random_state=666,
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Candidate models to compare, each with fixed hyper-parameters.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=49)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=7, random_state=111)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31, random_state=111)

# Short label -> estimator; insertion order is the order they are evaluated in.
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
}

In [10]:
# Fit every candidate on the training split and record its accuracy on the
# held-out split as (label, [accuracy]) pairs.
pred_scores = []
for clf_name in clfs:
    clf = clfs[clf_name]
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = accuracy_score(y_test, pred)
    pred_scores.append((clf_name, [score]))
pred_scores

[('SVC', [0.813953488372093]),
 ('KN', [0.7267441860465116]),
 ('NB', [0.7674418604651163]),
 ('DT', [0.813953488372093]),
 ('LR', [0.811046511627907]),
 ('RF', [0.8401162790697675])]

In [11]:
import pickle

# Persist the chosen classifier together with the fitted vectorizer; both
# files are needed to score new text later.
best_classifier = clfs["RF"]
artifacts = (
    ('clf.pkl', best_classifier),
    ('vectorizer.pkl', vectorizer),
)
for pickle_path, artifact in artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)