In [1]:
# Import packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper

from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, ConfusionMatrixDisplay, accuracy_score



# Reading the Data

In [2]:
# Load the train / validation / test splits in one pass; the three paths are
# relative to the notebook's working directory.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/val_data.csv",
        "../../Data/Final datasets/test_data.csv",
    )
)

# Show five random training rows as a quick sanity check of the columns.
train_data.sample(5)

Unnamed: 0.1,Unnamed: 0,char_count,word_count,sentence_count,prop_unique_words,avg_sentence_length,prop_punctuations,prop_stopwords,prop_words_in_quotes,prop_nouns,prop_verbs,prop_adjectives,prop_discourse_relations,textblob_sentiment,text,class_label,text_preprocessed
9049,9049,0.548737,0.550496,0.345213,0.54695,0.655285,0.262236,0.859428,1.0,0.474574,0.631604,0.509785,0.158192,0.595933,Secret Trump University Papers Reveal How Bus...,1,secret trump univers paper reveal busi pump st...
24109,24109,0.527468,0.524026,0.376213,0.622294,0.58858,0.369281,0.833448,1.0,0.526127,0.592322,0.583513,0.278986,0.496429,"Biden, not mentioning Trump, defends free pres...",0,biden mention trump defend free press independ...
4238,4238,0.570914,0.579503,0.441508,0.514673,0.595876,0.326112,0.863752,1.0,0.419022,0.622103,0.477901,0.270968,0.527721,Rachel Maddow Reveals Her Secret To Successfu...,1,rachel maddow reveal secret success cover trum...
37938,37938,0.687854,0.685265,0.592188,0.413089,0.583133,0.347687,0.842255,1.0,0.492237,0.62814,0.508042,0.217184,0.529513,U.N. Admits Role in Cholera Epidemic in Haiti ...,0,un admit role cholera epidem haiti new york ti...
22858,22858,0.170669,0.171168,0.0,1.0,0.502413,0.308494,0.738554,1.0,0.63469,0.526576,1.0,0.0,0.5,Automation: Robots from Korea to America Are R...,1,autom robot korea america replac worker


In [3]:
# For every split, take the preprocessed text (model input) and the class
# label (target) out of the frame as plain NumPy arrays.
X_train_text, y_train = (
    train_data["text_preprocessed"].values,
    train_data["class_label"].values,
)
X_val_text, y_val = (
    val_data["text_preprocessed"].values,
    val_data["class_label"].values,
)
X_test_text, y_test = (
    test_data["text_preprocessed"].values,
    test_data["class_label"].values,
)

# Baseline Model (Random Forest)

In [13]:
# NOTE(review): this whole cell is commented out and has no effect. It sketches a
# RandomizedSearchCV alternative (10 sampled candidates, random_state=0) to the
# exhaustive grid search defined in the next cell.
# NOTE(review): 'auto' for max_features is presumably no longer accepted by recent
# scikit-learn releases -- confirm and update the list before re-enabling this cell.
# # fine tuning parameters
# # max_depth = [2, 8, 14, 20, 28, 34]
# # n_estimators = [100, 200, 300, 400, 500]
# # param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
# n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'bootstrap': bootstrap}

# # Base model to tune
# rf = RandomForestClassifier()

# clf = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, random_state = 0)

In [5]:
# Candidate values explored by the exhaustive grid search below.
max_depth = [2, 8, 14, 20, 28]
n_estimators = [100, 150]
# For RandomForestClassifier 'auto' was only an alias of 'sqrt' (deprecated in
# scikit-learn 1.1, removed in 1.3). Listing both made the search fit every
# configuration twice and fails outright on current releases, so only 'sqrt'
# is kept. Add e.g. 'log2' here to explore a genuinely different setting.
max_features = ['sqrt']
bootstrap = [True, False]
param_grid = dict(
    max_depth=max_depth,
    n_estimators=n_estimators,
    max_features=max_features,
    bootstrap=bootstrap,
)

# building grid search
# The base estimator must be built with scalar hyperparameters: GridSearchCV
# clones it and injects one candidate from `param_grid` per fit. Passing the
# candidate *lists* to the constructor (as was done before) leaves an estimator
# that is invalid on its own. A fixed seed makes the forests reproducible.
rf = RandomForestClassifier(random_state=0)
# 5-fold CV over 20 candidates = 100 fits; n_jobs=-1 spreads them over all cores.
clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

In [None]:
# NOTE(review): disabled cell. If re-enabled it would create a SaturnCluster and
# attach a dask.distributed Client to it; no other cell visible here uses
# `cluster` or `client`.
# from dask.distributed import Client
# from dask_saturn import SaturnCluster

# cluster = SaturnCluster()
# client = Client(cluster)

## Using TF-IDF
### Feature Selection

In [6]:
# with 134 features
# Uni- and bi-grams, English stop words removed; min_df=0.15 drops every term
# that occurs in fewer than 15% of the training documents.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# The vocabulary and IDF weights are learned from the training text only and
# then reused unchanged for the validation and test splits.
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

clf.fit(X_train, y_train)

# Predictions of the refitted best estimator on both held-out splits.
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# Validation data first, then test data, each followed by a separator line.
for split_heading, split_labels, split_preds in (
    ("Testing using validation data:", y_val, y_val_pred),
    ("Testing using test data:", y_test, y_test_pred),
):
    print(split_heading)
    print(classification_report(split_labels, split_preds))
    print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

KeyboardInterrupt: 

In [None]:
# with 3k features
# Same vectorizer settings as the previous experiment, but min_df=0.01 keeps
# every term that occurs in at least 1% of the training documents.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# Fit on the training text only; validation and test reuse that vocabulary.
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_val, X_test = (
    tfidf_vectorizer.transform(split_text)
    for split_text in (X_val_text, X_test_text)
)

clf.fit(X_train, y_train)

# Predictions of the refitted best estimator on both held-out splits.
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# Validation data
print("Testing using validation data:")
print(classification_report(y_val, y_val_pred))
print("------------------------------------------")

# Test data
print("Testing using test data:")
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

### Feature selection with all added features

In [11]:
# Read the train and test CSVs again into separate frames used by the
# "added features" experiments below; the validation split is not loaded here.
train_data_added_features_scaled, test_data_added_features_scaled = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Data/Final datasets/train_data.csv",
        "../../Data/Final datasets/test_data.csv",
    )
)
# val_data_added_features_scaled = pd.read_csv("../../Data/Final datasets/val_data.csv")

In [12]:
# Re-derive the target arrays from the reloaded frames so that they line up
# row-for-row with the mapper outputs built in the following cells.
y_train, y_test = (
    frame["class_label"].values
    for frame in (train_data_added_features_scaled, test_data_added_features_scaled)
)
# y_val = val_data_features["class_label"].values

In [13]:
# with 134 features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# All thirteen engineered columns; a transformer of None makes the mapper
# forward them as-is, placed ahead of the TF-IDF columns.
added_feature_columns = [
    'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment',
]

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (added_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# The vectorizer inside the mapper is fitted on the training frame only.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

clf.fit(X_train_added_features, y_train)

# Test Data
y_test_pred = clf.predict(X_test_added_features)
test_report = classification_report(y_test, y_test_pred)
print("Testing using test data:")
print(test_report)
print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      6361
           1       0.91      0.91      0.91      6660

    accuracy                           0.91     13021
   macro avg       0.91      0.91      0.91     13021
weighted avg       0.91      0.91      0.91     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 14}


In [14]:
# with 3k features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# All thirteen engineered columns; a transformer of None makes the mapper
# forward them as-is, placed ahead of the TF-IDF columns.
added_feature_columns = [
    'char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment',
]

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (added_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# The vectorizer inside the mapper is fitted on the training frame only.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

clf.fit(X_train_added_features, y_train)

# Test Data
y_test_pred = clf.predict(X_test_added_features)
test_report = classification_report(y_test, y_test_pred)
print("Testing using test data:")
print(test_report)
print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


### Feature selection with selected added features

In [17]:
# with 134 features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.15,
)

# Reduced set of seven engineered columns; a transformer of None makes the
# mapper forward them as-is, placed ahead of the TF-IDF columns.
selected_feature_columns = [
    'char_count',
    'word_count',
    'prop_unique_words',
    'avg_sentence_length',
    'prop_punctuations',
    'prop_stopwords',
    'prop_nouns',
]

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (selected_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# The vectorizer inside the mapper is fitted on the training frame only.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

clf.fit(X_train_added_features, y_train)

# Test Data
y_test_pred = clf.predict(X_test_added_features)
test_report = classification_report(y_test, y_test_pred)
print("Testing using test data:")
print(test_report)
print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      6361
           1       0.90      0.90      0.90      6660

    accuracy                           0.90     13021
   macro avg       0.90      0.90      0.90     13021
weighted avg       0.90      0.90      0.90     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}


In [18]:
# with 3k features
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.01,
)

# Reduced set of seven engineered columns; a transformer of None makes the
# mapper forward them as-is, placed ahead of the TF-IDF columns.
selected_feature_columns = [
    'char_count',
    'word_count',
    'prop_unique_words',
    'avg_sentence_length',
    'prop_punctuations',
    'prop_stopwords',
    'prop_nouns',
]

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (selected_feature_columns, None),
    ('text_preprocessed', tfidf_vectorizer),
])

# The vectorizer inside the mapper is fitted on the training frame only.
X_train_added_features = mapper.fit_transform(train_data_added_features_scaled)
X_test_added_features = mapper.transform(test_data_added_features_scaled)

clf.fit(X_train_added_features, y_train)

# Test Data
y_test_pred = clf.predict(X_test_added_features)
test_report = classification_report(y_test, y_test_pred)
print("Testing using test data:")
print(test_report)
print("------------------------------------------")

print("Best Parameters:", clf.best_params_)

Testing using test data:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94      6361
           1       0.95      0.95      0.95      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 12}
