In [7]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Reading the Data

In [2]:
# Read the train / validation / test splits; the paths are relative to
# the directory the notebook is run from.
train_data, val_data, test_data = map(
    pd.read_csv,
    (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/val_data.csv",
        "../../Data/Final datasets/test_data.csv",
    ),
)

# Display five randomly chosen training rows as a quick sanity check.
train_data.sample(5)

Unnamed: 0.1,Unnamed: 0,char_count,word_count,sentence_count,prop_unique_words,avg_sentence_length,prop_punctuations,prop_stopwords,prop_words_in_quotes,prop_nouns,prop_verbs,prop_adjectives,prop_discourse_relations,textblob_sentiment,text,class_label,text_preprocessed
7329,7329,0.52568,0.519298,0.305247,0.553054,0.654484,0.322593,0.856301,1.0,0.45276,0.566128,0.620464,0.185606,0.494083,Soy should not be consumed in significant quan...,1,soy consum signific quantiti unless ferment tr...
37228,37228,0.430449,0.435444,0.223589,0.818618,0.626127,0.389969,0.833284,1.0,0.530901,0.572455,0.506583,0.116667,0.500781,Democrat Mayor Proclaims He’s Barring Trump Fr...,1,democrat mayor proclaim he bar trump enter st ...
34563,34563,0.58452,0.588909,0.501643,0.41143,0.547206,0.405266,0.869145,1.0,0.460528,0.644164,0.479209,0.275591,0.557891,Trump Just Accidentally Confirmed Russian Con...,1,trump accident confirm russian contact session...
11993,11993,0.576491,0.576252,0.489896,0.565649,0.542336,0.421679,0.829616,1.0,0.508338,0.581813,0.523673,0.325942,0.544597,WATCH: Angry White Actors Have Silent Meltdow...,1,watch angri white actor silent meltdown meryl ...
20643,20643,0.582368,0.586367,0.422957,0.549625,0.623866,0.288776,0.853268,1.0,0.486701,0.578983,0.574013,0.211694,0.541721,Trump’s Barely Literate Note Left At Israel H...,1,trump bare liter note left israel holocaust me...


In [3]:
# For every split, pull out the preprocessed text (model input) and the
# class label (target) through `.values`.
X_train_text, y_train = (
    train_data["text_preprocessed"].values,
    train_data["class_label"].values,
)
X_val_text, y_val = (
    val_data["text_preprocessed"].values,
    val_data["class_label"].values,
)
X_test_text, y_test = (
    test_data["text_preprocessed"].values,
    test_data["class_label"].values,
)

# Baseline Model (Decision Tree)

In [8]:
# Hyper-parameter grid for the baseline decision tree. The `dt__` prefix
# routes each parameter to the pipeline step named 'dt'.
criterion = ['gini', 'entropy']
max_depth = [12, 14, 16, 18, 20, 22]
parameters = dict(dt__criterion=criterion, dt__max_depth=max_depth)

# Seed the tree so that repeated runs of the notebook give the same model:
# scikit-learn permutes the candidate features at every split, so without a
# fixed random_state ties between equally good splits can be broken
# differently from run to run, changing the selected parameters and scores.
dt = DecisionTreeClassifier(random_state=0)
pipe = Pipeline(steps=[('dt', dt)])

# Grid search over the pipeline with GridSearchCV's default cross-validation
# and scoring. This single `clf` object is re-fitted by every experiment
# cell below; each call to `fit` starts a fresh search.
clf = GridSearchCV(pipe, parameters)

## Using TF-IDF
### Feature Selection

In [9]:
# TF-IDF over word unigrams and bigrams. min_df=0.15 drops every term that
# occurs in fewer than 15% of the training documents (the original note
# reports 134 features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# Learn the vocabulary and IDF weights from the training text only, then
# project the validation and test text onto that same vocabulary.
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

# Run the grid search on the training matrix.
clf.fit(X_train, y_train)

# Report on the validation split.
y_val_pred = clf.predict(X_val)
print(
    "Testing using validation data:",
    classification_report(y_val, y_val_pred),
    "------------------------------------------",
    sep="\n",
)

# Report on the test split.
y_test_pred = clf.predict(X_test)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using validation data:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      6361
           1       0.91      0.92      0.91      6659

    accuracy                           0.91     13020
   macro avg       0.91      0.91      0.91     13020
weighted avg       0.91      0.91      0.91     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      6361
           1       0.91      0.93      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


In [10]:
# Same TF-IDF setup with a much lower document-frequency cut-off:
# min_df=0.01 keeps terms present in at least 1% of the training documents
# (the original note reports roughly 3k features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# Fit on the training text, then reuse the fitted vocabulary for the
# validation and test text.
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

# Re-run the grid search on the wider feature matrix.
clf.fit(X_train, y_train)

# Report on the validation split.
y_val_pred = clf.predict(X_val)
print(
    "Testing using validation data:",
    classification_report(y_val, y_val_pred),
    "------------------------------------------",
    sep="\n",
)

# Report on the test split.
y_test_pred = clf.predict(X_test)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using validation data:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      6361
           1       0.95      0.97      0.96      6659

    accuracy                           0.95     13020
   macro avg       0.96      0.95      0.95     13020
weighted avg       0.96      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6361
           1       0.95      0.96      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


### Feature selection with all added features

In [11]:
# Frames used by the "added features" experiments. These are the same CSV
# paths that were read into `train_data` / `test_data` earlier in the
# notebook. The validation split is not loaded here; the cells below
# evaluate on the test split only.
train_data_added_features_scaled, test_data_added_features_scaled = map(
    pd.read_csv,
    (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/test_data.csv",
    ),
)

In [12]:
# Target labels for the train and test frames used by the added-features
# experiments (no validation labels are taken in this section).
y_train, y_test = (
    frame["class_label"].values
    for frame in (
        train_data_added_features_scaled,
        test_data_added_features_scaled,
    )
)

In [13]:
# TF-IDF over word unigrams and bigrams with min_df=0.15 (the original note
# reports 134 text features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# Build one feature matrix from two sources:
#   * the thirteen hand-crafted columns, passed through unchanged
#     (their transformer is None);
#   * the TF-IDF vectors of the preprocessed text.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'sentence_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_words_in_quotes',
            'prop_nouns',
            'prop_verbs',
            'prop_adjectives',
            'prop_discourse_relations',
            'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper (and therefore the vectorizer) on the training frame only,
# then apply the fitted mapper to the test frame.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Grid-search the decision tree on the combined features.
clf.fit(X_train_added_features, y_train)

# Report on the test split.
y_test_pred = clf.predict(X_test_added_features)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      6361
           1       0.91      0.91      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 14}


In [14]:
# TF-IDF over word unigrams and bigrams with min_df=0.01 (the original note
# reports roughly 3k text features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# Build one feature matrix from two sources:
#   * the thirteen hand-crafted columns, passed through unchanged
#     (their transformer is None);
#   * the TF-IDF vectors of the preprocessed text.
# NOTE(review): DataFrameMapper is constructed with its default settings
# here; with ~3k text columns the resulting matrix may be large in memory --
# check whether the mapper's `sparse` option is worth enabling.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'sentence_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_words_in_quotes',
            'prop_nouns',
            'prop_verbs',
            'prop_adjectives',
            'prop_discourse_relations',
            'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then apply it to the test frame.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Grid-search the decision tree on the combined features.
clf.fit(X_train_added_features, y_train)

# Report on the test split.
y_test_pred = clf.predict(X_test_added_features)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


### Feature selection with selected added features

In [17]:
# TF-IDF over word unigrams and bigrams with min_df=0.15 (the original note
# reports 134 text features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# Combine a reduced set of seven hand-crafted columns (passed through
# unchanged, since their transformer is None) with the TF-IDF vectors of
# the preprocessed text.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then apply it to the test frame.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Grid-search the decision tree on the combined features.
clf.fit(X_train_added_features, y_train)

# Report on the test split.
y_test_pred = clf.predict(X_test_added_features)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      6361
           1       0.90      0.90      0.90      6660

    accuracy                           0.90     13021
   macro avg       0.90      0.90      0.90     13021
weighted avg       0.90      0.90      0.90     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


In [18]:
# TF-IDF over word unigrams and bigrams with min_df=0.01 (the original note
# reports roughly 3k text features for this setting).
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# Combine a reduced set of seven hand-crafted columns (passed through
# unchanged, since their transformer is None) with the TF-IDF vectors of
# the preprocessed text.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on the training frame only, then apply it to the test frame.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Grid-search the decision tree on the combined features.
clf.fit(X_train_added_features, y_train)

# Report on the test split.
y_test_pred = clf.predict(X_test_added_features)
print(
    "Testing using test data:",
    classification_report(y_test, y_test_pred),
    "------------------------------------------",
    sep="\n",
)

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}
