In [1]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn_pandas import DataFrameMapper

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score



# Reading the Data

In [2]:
# Load the pre-split train / validation / test CSVs in one pass.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/val_data.csv",
        "../../Data/Final datasets/test_data.csv",
    )
)

# Preview five random training rows (no random_state is set, so the rows vary per run).
train_data.sample(5)

Unnamed: 0.1,Unnamed: 0,char_count,word_count,sentence_count,prop_unique_words,avg_sentence_length,prop_punctuations,prop_stopwords,prop_words_in_quotes,prop_nouns,prop_verbs,prop_adjectives,prop_discourse_relations,textblob_sentiment,text,class_label,text_preprocessed
26609,26609,0.666917,0.659978,0.559218,0.440509,0.58307,0.340252,0.841269,1.0,0.533927,0.618542,0.508305,0.282543,0.535695,Fellow Republicans rebuke Trump over governmen...,0,fellow republican rebuk trump govern shutdown ...
29157,29157,0.603781,0.610783,0.501643,0.526351,0.576269,0.387325,0.86149,1.0,0.507508,0.612994,0.507395,0.325321,0.57628,"Damien Chazelle, ‘La La Land’ Director, on Cal...",0,damien chazel la la land director california a...
11538,11538,0.563366,0.557473,0.42942,0.536317,0.5789,0.322225,0.835132,1.0,0.563106,0.603683,0.510901,0.259259,0.461483,Controversial Milwaukee County sheriff says ta...,0,controversi milwauke counti sheriff say take u...
19374,19374,0.715887,0.711323,0.629687,0.370864,0.579614,0.328084,0.862529,1.0,0.471117,0.600421,0.563037,0.235367,0.543917,Neuroscientist Says Fasting Reduces the Risk o...,1,neuroscientist say fast reduc risk brain disea...
4872,4872,0.610511,0.609927,0.472508,0.513272,0.604768,0.331432,0.837756,1.0,0.508367,0.589346,0.545674,0.271405,0.524322,TRUMP THREATENS TO SUE ILLEGAL IMMIGRANT ACTIV...,1,trump threaten sue illeg immigr activist fav o...


In [3]:
# For each split, pull out the preprocessed text column and the class-label
# column as plain NumPy arrays.
X_train_text, y_train = train_data["text_preprocessed"].values, train_data["class_label"].values
X_val_text, y_val = val_data["text_preprocessed"].values, val_data["class_label"].values
X_test_text, y_test = test_data["text_preprocessed"].values, test_data["class_label"].values

# Baseline Model (Decision Tree)
## Using CountVectorizer with Bag of Words, Unigrams + Bigrams

### Using 134 and 3k features

In [5]:
# with 134 features
# min_df=0.15 keeps only unigrams/bigrams that appear in at least 15% of the
# training documents; the recorded run of this notebook reports 134 such terms.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.15)

# Learn the vocabulary and build the training matrix in a single pass
# (a separate fit + transform tokenizes the training corpus twice).
X_train = vectorizer.fit_transform(X_train_text)

# Validation and test reuse the vocabulary learned from the training split only.
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [6]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted vocabulary_ mapping holds one entry per retained term, so its
# length is the feature count on every scikit-learn version.
print("number of features used:", len(vectorizer.vocabulary_))

number of features used: 134


In [7]:
# Baseline classifier: Gini-criterion decision tree with a fixed seed
clf = DecisionTreeClassifier(random_state=0, criterion='gini')

# Fit on the bag-of-words training matrix; fit() is the last expression,
# so the notebook displays the estimator it returns.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [9]:
# Predict on the validation split and print per-class precision / recall / F1.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      6361
           1       0.90      0.91      0.90      6659

    accuracy                           0.90     13020
   macro avg       0.90      0.90      0.90     13020
weighted avg       0.90      0.90      0.90     13020



In [10]:
# Predict on the held-out test split and print per-class precision / recall / F1.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90      6361
           1       0.91      0.91      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021



In [17]:
# Disabled cell: renders the fitted decision tree to a PNG through
# export_graphviz + pydotplus. Nothing here executes.
# NOTE(review): feature_cols holds the single name 'text_preprocessed', while
# clf is trained on the vectorizer's term columns — presumably feature_names
# should be the vectorizer's feature names; confirm before re-enabling.

# from sklearn.tree import export_graphviz
# import pydotplus
# from six import StringIO  
# from IPython.display import Image

# feature_cols = ['text_preprocessed']

# dot_data = StringIO()
# export_graphviz(clf, out_file = dot_data, 
#                       feature_names = feature_cols,  
#                      filled = True, rounded = True,  
#                     special_characters = True)

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())

In [5]:
# Disabled cell: alternative tree visualisation via sklearn's plot_tree.
# As written, the call would refit clf on X_train / y_train before plotting.

# from sklearn.tree import plot_tree

# plot_tree(clf.fit(X_train, y_train))

In [8]:
# with 3k features
# Lowering min_df to 0.01 keeps every unigram/bigram present in at least 1% of
# the training documents (the section heading puts this at roughly 3k terms).
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.01)

# Learn the vocabulary and build the training matrix in a single pass
# (a separate fit + transform tokenizes the training corpus twice).
X_train = vectorizer.fit_transform(X_train_text)

# Validation and test reuse the vocabulary learned from the training split only.
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

In [9]:
# Re-train the same estimator object on the current (larger) count matrix;
# fit() is the last expression, so the estimator it returns is displayed.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [10]:
# Predict on the validation split and print per-class precision / recall / F1.
y_val_pred = clf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_val_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      6361
           1       0.95      0.95      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020



In [11]:
# Predict on the held-out test split and print per-class precision / recall / F1.
y_test_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021



## Using TF-IDF
### Feature Selection

In [11]:
# with 134 features
# Maps a human-readable label to the ngram_range passed to the vectorizer.
# Only one setting is listed, so the loop body runs once.
tfidf_params = {'unigram and bigram': (1,2)}

for ngram, values in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values, min_df=0.15)

    # Fit on the training text and build the training matrix in one pass
    # (a separate fit + transform tokenizes the training corpus twice).
    X_train = tfidf_vectorizer.fit_transform(X_train_text)

    # Validation and test reuse the vocabulary / IDF weights learned on train.
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f"Model with {ngram}")
    clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))
    print("------------------------------------------")
    print("------------------------------------------")

Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      6361
           1       0.90      0.91      0.91      6659

    accuracy                           0.90     13020
   macro avg       0.90      0.90      0.90     13020
weighted avg       0.90      0.90      0.90     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91      6361
           1       0.91      0.92      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
------------------------------------------


In [12]:
# with 3k features
# Maps a human-readable label to the ngram_range passed to the vectorizer.
# Only one setting is listed, so the loop body runs once.
tfidf_params = {'unigram and bigram': (1,2)}

for ngram, values in tfidf_params.items():
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=values, min_df=0.01)

    # Fit on the training text and build the training matrix in one pass
    # (a separate fit + transform tokenizes the training corpus twice).
    X_train = tfidf_vectorizer.fit_transform(X_train_text)

    # Validation and test reuse the vocabulary / IDF weights learned on train.
    X_val = tfidf_vectorizer.transform(X_val_text)
    X_test = tfidf_vectorizer.transform(X_test_text)

    print(f"Model with {ngram}")
    clf.fit(X_train, y_train)

    # Validation Data
    print("Testing using validation data:")
    y_val_pred = clf.predict(X_val)
    print(classification_report(y_val, y_val_pred))
    print("------------------------------------------")

    # Test Data
    print("Testing using test data:")
    y_test_pred = clf.predict(X_test)
    print(classification_report(y_test, y_test_pred))

Model with unigram and bigram
Testing using validation data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      6361
           1       0.94      0.95      0.95      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      6361
           1       0.94      0.95      0.95      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



### Feature selection with all added features

In [18]:
# Read the train and test CSVs again under section-specific names
# (the validation load below is left disabled).
train_data_added_features_scaled, test_data_added_features_scaled = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/test_data.csv",
    )
)
# val_data_added_features_scaled = pd.read_csv("../../Data/Final datasets/val_data.csv")

In [16]:
# Class labels for the train and test frames as NumPy arrays
# (the validation labels stay disabled).
y_train, y_test = (
    frame["class_label"].values
    for frame in (train_data_added_features_scaled, test_data_added_features_scaled)
)
# y_val = val_data_features["class_label"].values

In [19]:
# with 134 features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.15,
)

# The mapper has two entries: the thirteen numeric columns with a None
# transformer, and the preprocessed text routed through the TF-IDF vectorizer.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'sentence_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_words_in_quotes',
            'prop_nouns',
            'prop_verbs',
            'prop_adjectives',
            'prop_discourse_relations',
            'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on train only, then apply the fitted mapper to test.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Re-train the tree on the combined feature matrix.
clf.fit(X=X_train_added_features, y=y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

Testing using test data:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      6361
           1       0.90      0.88      0.89      6660

    accuracy                           0.89     13021
   macro avg       0.89      0.89      0.89     13021
weighted avg       0.89      0.89      0.89     13021



In [20]:
# with 3k features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.01,
)

# The mapper has two entries: the thirteen numeric columns with a None
# transformer, and the preprocessed text routed through the TF-IDF vectorizer.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'sentence_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_words_in_quotes',
            'prop_nouns',
            'prop_verbs',
            'prop_adjectives',
            'prop_discourse_relations',
            'textblob_sentiment',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on train only, then apply the fitted mapper to test.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Re-train the tree on the combined feature matrix.
clf.fit(X=X_train_added_features, y=y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

Testing using test data:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      6361
           1       0.94      0.93      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.93      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021



### Feature selection with selected added features

In [21]:
# with 134 features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.15,
)

# The mapper has two entries: a seven-column subset of the numeric features
# with a None transformer, and the preprocessed text routed through TF-IDF.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on train only, then apply the fitted mapper to test.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Re-train the tree on the combined feature matrix.
clf.fit(X=X_train_added_features, y=y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

Testing using test data:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      6361
           1       0.88      0.89      0.89      6660

    accuracy                           0.88     13021
   macro avg       0.88      0.88      0.88     13021
weighted avg       0.88      0.88      0.88     13021



In [22]:
# with 3k features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1,2),
    min_df=0.01,
)

# The mapper has two entries: a seven-column subset of the numeric features
# with a None transformer, and the preprocessed text routed through TF-IDF.
mapper = DataFrameMapper([
    (
        [
            'char_count',
            'word_count',
            'prop_unique_words',
            'avg_sentence_length',
            'prop_punctuations',
            'prop_stopwords',
            'prop_nouns',
        ],
        None,
    ),
    ('text_preprocessed', tfidf_vectorizer),
])

# Fit the mapper on train only, then apply the fitted mapper to test.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

# Re-train the tree on the combined feature matrix.
clf.fit(X=X_train_added_features, y=y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

Testing using test data:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      6361
           1       0.94      0.93      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021

