In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally; stemming() reads it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anshuman.kundu1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the dataset; the path is relative to the directory the notebook runs from.
df = pd.read_csv('./data/Processed.csv')

In [3]:
# Inspect column names, dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  44898 non-null  int64 
 1   label       44898 non-null  int64 
 2   content     44898 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.0+ MB


In [4]:
# 'Unnamed: 0' is an int64 column that looks like a row index saved into the CSV
# (NOTE(review): confirm); it is not used as a feature.
# `columns=` already implies the column axis, so the extra `axis=1` is dropped.
# errors='ignore' keeps the cell re-runnable once the column has been removed.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Preview the first rows after dropping the 'Unnamed: 0' column.
df.head()

Unnamed: 0,label,content
0,0,Bill Maher: Trump’s Orlando Massacre Response...
1,0,THE YOUNG GIRL THE CLINTONS DESTROYED…Monica L...
2,1,"Iran, Saudi Arabia to exchange diplomatic visi..."
3,0,Trump Just Got Dealt A STINGING Blow From His...
4,0,Governor Of Texas Unveils Detailed Plan To OV...


In [6]:
# One Porter stemmer instance, reused by stemming() for every document.
ps = PorterStemmer()

In [7]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        # The corpus file id is lowercase; 'English' only resolves on
        # case-insensitive filesystems.
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def stemming(news, stemmer=None, stop_words=None):
    """Normalise one document: keep letters only, lowercase, drop stop words, stem.

    Parameters
    ----------
    news : str
        Raw text of a single document.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level ``ps`` stemmer.
    stop_words : collection of str, optional
        Lowercase words to discard. Defaults to NLTK's English stop-word list,
        which is loaded once and cached instead of being re-read for every token.

    Returns
    -------
    str
        Space-separated stemmed tokens ('' when nothing survives filtering).
    """
    if stemmer is None:
        stemmer = ps
    if stop_words is None:
        stop_words = _english_stop_words()

    # Every character that is not an ASCII letter acts as a token separator.
    tokens = re.sub('[^a-zA-Z]', ' ', news).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [8]:
# Replace the raw text with its stemmed form. This overwrites df['content'],
# so re-running the cell feeds already-processed text back through stemming().
df['content'] = df['content'].apply(stemming)

In [9]:
# Preview the rows again to confirm the text is lowercased and stemmed.
df.head()

Unnamed: 0,label,content
0,0,bill maher trump orlando massacr respons show ...
1,0,young girl clinton destroy monica lewinski pro...
2,1,iran saudi arabia exchang diplomat visit irani...
3,0,trump got dealt sting blow son law newspap news
4,0,governor texa unveil detail plan overthrow fed...


In [10]:
# Features (stemmed text) and targets (labels) as plain NumPy arrays.
X, y = df['content'].values, df['label'].values

In [11]:
# Quick look at the text array (NumPy abbreviates long arrays with '...').
print(X)

['bill maher trump orlando massacr respons show america narcissist video news'
 'young girl clinton destroy monica lewinski probabl year old want polit'
 'iran saudi arabia exchang diplomat visit iranian foreign minist worldnew'
 ... 'hous republican pursu short term govern fund bill politicsnew'
 'white nationalist radio trump gave us press pass interview son news'
 'media immedi report alleg killer imam assist cathol take day id religion muslim commit act terror us video polit']


In [12]:
# Quick look at the label array.
print(y)

[0 0 1 ... 1 0 0]


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts, capped at the 14,000 most frequent terms.
cv = CountVectorizer(max_features=14000)
# Keep the SciPy sparse matrix returned by fit_transform. Densifying a
# 44,898 x 13,208 int64 matrix needs roughly 4.7 GB of RAM, while
# train_test_split, DecisionTreeClassifier and LogisticRegression all accept
# sparse input directly.
# NOTE(review): call .toarray() only for an estimator that requires dense
# input (e.g. GaussianNB), and preferably on a subset.
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split — confirm that is acceptable for the reported scores.
X = cv.fit_transform(X)

In [14]:
# (number of documents, vocabulary size); the vocabulary may be smaller than max_features.
X.shape

(44898, 13208)

In [15]:
# Should have one label per row of X.
y.shape

(44898,)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Candidate classifiers for the comparison loop, keyed by display name.
models = {
    # A fixed random_state makes the tree's tie-breaking between equally good
    # splits repeatable, matching the seeded train/test split.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
}

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def evaluate_model(y_true, y_pred, return_all=False):
    """Score predictions against the ground-truth labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    return_all : bool, default False
        When False, only the accuracy is computed and returned, which is what
        existing callers expect. When True, the full metrics dictionary is
        returned instead.

    Returns
    -------
    float or dict
        The accuracy, or (with ``return_all=True``) a dict with the keys
        'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Confusion Matrix'
        and 'Classification Report'.
    """
    accuracy = accuracy_score(y_true, y_pred)
    if not return_all:
        # The other metrics used to be computed and then discarded; skip them
        # when the caller only wants the accuracy.
        return accuracy

    # precision/recall/F1 use scikit-learn's default averaging.
    return {
        'Accuracy': accuracy,
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
        'Confusion Matrix': confusion_matrix(y_true, y_pred),
        'Classification Report': classification_report(y_true, y_pred),
    }


In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Fit every candidate on the training split and record its test accuracy.
# model_list and accuracy_list are parallel lists (same order as `models`).
model_list = []
accuracy_list = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Only the test-set predictions are scored; the training-set prediction
    # pass was computed and never used, so it is omitted.
    y_test_pred = model.predict(X_test)
    accuracy = evaluate_model(y_test, y_test_pred)
    accuracy_list.append(accuracy)
    model_list.append(model_name)

In [20]:
# One row per model: its display name next to its test accuracy.
Evaluation_df = pd.DataFrame(
    {
        "Model": model_list,
        "Accuracy": accuracy_list,
    }
)

In [22]:
# NOTE(review): the recorded run shows 1.0 accuracy for both models. The stemmed
# text contains tokens such as 'politicsnew' and 'worldnew', which look like an
# appended subject/category field — confirm this is not leaking the label.
Evaluation_df

Unnamed: 0,Model,Accuracy
0,Decision Tree,1.0
1,Logistic Regression,1.0


In [23]:
# Select the top-scoring model from the comparison instead of hard-coding a
# class and refitting it. Ties resolve to the first model listed in `models`.
best_index = accuracy_list.index(max(accuracy_list))
best_model_name = model_list[best_index]
best_model = models[best_model_name]  # already fitted on X_train in the evaluation loop
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model is ", accuracy)

Accuracy of the model is  1.0


In [None]:
import matplotlib.pyplot as plt
# x carries the true labels and y the predictions, so the data matches the
# axis labels (the arguments were previously passed the other way round).
plt.scatter(y_test, y_pred)
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()

In [None]:
import pickle
from pathlib import Path

model_file = '../artifacts/model.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists before writing.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): only the classifier is persisted. Scoring new text would also
# need the fitted CountVectorizer (`cv`) and the stemming() step — confirm
# whether those should be saved alongside the model.
with open(model_file, 'wb') as file:
    pickle.dump(best_model, file)