In [1]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Reading the Data

In [2]:
train_data = pd.read_csv("../../Data/Combined data/train_data.csv")
val_data = pd.read_csv("../../Data/Combined data/validation_data.csv")
test_data = pd.read_csv("../../Data/Combined data/test_data.csv")

train_data.sample(5)

Unnamed: 0.1,Unnamed: 0,text,class_label,text_preprocessed
7171,3719,China Sees New Ambiguity With Donald Trump’s T...,0,china see new ambigu donald trump taiwan call ...
19996,18613,"Polish cut in retirement age comes into force,...",0,polish cut retir age come forc buck european t...
30542,10928,FORMER FBI ASST DIRECTOR: “Jim Comey ‘Danced W...,1,former fbi asst director jim comey danc devil…...
31671,10339,KID ROCK’S INTERVIEW With Piers Morgan Sheds L...,1,kid rock interview pier morgan shed light demo...
5225,7261,This Christian Minister Just Gave A 4 Minute ...,1,christian minist gave 4 minut speech discrimin...


In [3]:
X_train_text = train_data["text_preprocessed"].values
y_train = train_data["class_label"].values

X_val_text = val_data["text_preprocessed"].values
y_val = val_data["class_label"].values

X_test_text = test_data["text_preprocessed"].values
y_test = test_data["class_label"].values

# Baseline Model (Decision Tree)
## Using CountVectoriser with Bag of Words, Unigrams

In [4]:
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))
vectorizer.fit(X_train_text)

X_train = vectorizer.transform(X_train_text)
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [6]:
print("number of features used:", len(vectorizer.get_feature_names()))

number of features used: 238266


In [7]:
# Sparse vector of frequency of each word appearing in a text article
print(X_train)

  (0, 516)	1
  (0, 1922)	1
  (0, 2029)	1
  (0, 4715)	1
  (0, 5598)	1
  (0, 6839)	1
  (0, 7282)	1
  (0, 7488)	1
  (0, 7630)	1
  (0, 7783)	1
  (0, 8151)	1
  (0, 8253)	1
  (0, 9085)	1
  (0, 9332)	1
  (0, 9881)	1
  (0, 11762)	1
  (0, 14195)	1
  (0, 14350)	1
  (0, 14626)	1
  (0, 15044)	1
  (0, 15420)	1
  (0, 16070)	1
  (0, 16649)	1
  (0, 16853)	2
  (0, 16959)	1
  :	:
  (39059, 207643)	1
  (39059, 207675)	1
  (39059, 207722)	8
  (39059, 209919)	1
  (39059, 211547)	1
  (39059, 212398)	1
  (39059, 213163)	1
  (39059, 213646)	1
  (39059, 213724)	3
  (39059, 213905)	1
  (39059, 214143)	1
  (39059, 214295)	1
  (39059, 214663)	2
  (39059, 216315)	1
  (39059, 217267)	1
  (39059, 217354)	1
  (39059, 217846)	1
  (39059, 217891)	1
  (39059, 218513)	1
  (39059, 218862)	1
  (39059, 218944)	1
  (39059, 219049)	1
  (39059, 219173)	1
  (39059, 219210)	1
  (39059, 220668)	1


In [5]:
dt = DecisionTreeClassifier()
pipe = Pipeline(steps=[('dt', dt)])
criterion = ['gini', 'entropy']
max_depth = [12, 14, 16, 18, 20, 22]
parameters = dict(dt__criterion = criterion, dt__max_depth = max_depth)

# initialise Decision Tree
# clf = DecisionTreeClassifier(criterion='entropy',random_state=0)
clf = GridSearchCV(pipe, parameters) 

# train model
clf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('dt', DecisionTreeClassifier())]),
             param_grid={'dt__criterion': ['gini', 'entropy'],
                         'dt__max_depth': [12, 14, 16, 18, 20, 22]})

In [6]:
y_val_pred = clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      6361
           1       0.96      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020



In [7]:
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      6361
           1       0.96      0.97      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021



In [8]:
# from sklearn.tree import export_graphviz
# import pydotplus
# from six import StringIO  
# from IPython.display import Image

# feature_cols = ['text_preprocessed']

# dot_data = StringIO()
# export_graphviz(clf, out_file = dot_data, 
#                       feature_names = feature_cols,  
#                      filled = True, rounded = True,  
#                     special_characters = True)

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())

In [9]:
# from sklearn.tree import plot_tree

# plot_tree(clf.fit(X_train, y_train))

## Using CountVectorizer with Bag of Word, Unigrams + Bigrams

In [13]:
vectorizer2 = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)
vectorizer2.fit(X_train_text)

X_train = vectorizer2.transform(X_train_text)
X_val = vectorizer2.transform(X_val_text)
X_test = vectorizer2.transform(X_test_text)

In [14]:
# train model
clf.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('dt', DecisionTreeClassifier())]),
             param_grid={'dt__criterion': ['gini', 'entropy'],
                         'dt__max_depth': [12, 14, 16, 18, 20, 22]})

In [15]:
y_val_pred = clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      6361
           1       0.95      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020



In [16]:
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021



## Using CountVectorizer with Bag of Word, Bigrams only

In [14]:
vectorizer3 = CountVectorizer(stop_words='english', ngram_range=(2,2))
vectorizer3.fit(X_train_text)

X_train = vectorizer3.transform(X_train_text)
X_val = vectorizer3.transform(X_val_text)
X_test = vectorizer3.transform(X_test_text)

In [15]:
# train model
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [16]:
y_val_pred = clf.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      6361
           1       0.91      0.92      0.92      6659

    accuracy                           0.92     13020
   macro avg       0.92      0.92      0.92     13020
weighted avg       0.92      0.92      0.92     13020



In [17]:
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      6361
           1       0.92      0.92      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021



## Using Tf-ldf and Unigrams

In [18]:
tfidf_params = {'unigram':(1,1), 'unigram and bigram': (1,2), 'bigram':(2,2)}

for ngram, values in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values)
    tfidf_vectorizer.fit(X_train_text)

    X_train = tfidf_vectorizer.transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f"Model with {ngram}")
    clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")    
    y_val_pred = clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      6361
           1       0.95      0.96      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      6361
           1       0.96      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------
------------------------------------------
Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.95   

In [17]:
# with 3k features
tfidf_params = {'unigram and bigram': (1,2)}

for ngram, values in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values, min_df = 0.01)
    tfidf_vectorizer.fit(X_train_text)

    X_train = tfidf_vectorizer.transform(X_train_text)
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f"Model with {ngram}")
    clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")    
    y_val_pred = clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      6361
           1       0.95      0.97      0.96      6659

    accuracy                           0.96     13020
   macro avg       0.96      0.96      0.96     13020
weighted avg       0.96      0.96      0.96     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
------------------------------------------


In [8]:
print('Best Criterion', clf.best_estimator_.get_params()['dt__criterion'])
print('Best max_depth', clf.best_estimator_.get_params(['dt__max_depth']))
print(); print(clf.best_estimator_.get_params()['dt'])

Best Criterion gini
Best max_depth {'memory': None, 'steps': [('dt', DecisionTreeClassifier(max_depth=22))], 'verbose': False, 'dt': DecisionTreeClassifier(max_depth=22), 'dt__ccp_alpha': 0.0, 'dt__class_weight': None, 'dt__criterion': 'gini', 'dt__max_depth': 22, 'dt__max_features': None, 'dt__max_leaf_nodes': None, 'dt__min_impurity_decrease': 0.0, 'dt__min_impurity_split': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'dt__min_weight_fraction_leaf': 0.0, 'dt__random_state': None, 'dt__splitter': 'best'}

DecisionTreeClassifier(max_depth=22)
