# Imports + Read Data

In [1]:
base_path = '/Users/joshuawong/Documents/GitHub/' # Change accordingly

os.chdir(base_path+'Fake-News-Detection/Analysis')
from utils import *

In [2]:
train_data = pd.read_csv("../Data/Final datasets/train_data.csv", index_col=0)
val_data = pd.read_csv("../Data/Final datasets/val_data.csv", index_col=0)
test_data = pd.read_csv("../Data/Final datasets/test_data.csv", index_col=0)

In [3]:
X_train_text = train_data["text_preprocessed"].values
y_train = train_data["class_label"].values

X_val_text = val_data["text_preprocessed"].values
y_val = val_data["class_label"].values

X_test_text = test_data["text_preprocessed"].values
y_test = test_data["class_label"].values

# XGBoost

In [23]:
clf = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

## 134 features

### 134 features only

In [34]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

In [35]:
clf.fit(X_train, y_train)

# Test Data
print("Testing using test data:")  
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      6361
           1       0.91      0.93      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------


### 134 + all added features

In [25]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [26]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      6361
           1       0.92      0.97      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021

------------------------------------------


### 134 + selected added features

In [27]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [28]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      6361
           1       0.93      0.95      0.94      6660

    accuracy                           0.94     13021
   macro avg       0.94      0.94      0.94     13021
weighted avg       0.94      0.94      0.94     13021

------------------------------------------


## 3k features

### 3k features only

In [49]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

In [50]:
clf.fit(X_train, y_train)

# Test Data
print("Testing using test data:")  
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------


### 3k + all added features

In [29]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [30]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      6361
           1       0.97      0.98      0.98      6660

    accuracy                           0.98     13021
   macro avg       0.98      0.98      0.98     13021
weighted avg       0.98      0.98      0.98     13021

------------------------------------------


### 3k + selected added features

In [31]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [32]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      6361
           1       0.97      0.98      0.98      6660

    accuracy                           0.98     13021
   macro avg       0.98      0.98      0.98     13021
weighted avg       0.98      0.98      0.98     13021

------------------------------------------


# AdaBoost

In [33]:
clf = AdaBoostClassifier()

## 134 Features

### 134 Features only

In [36]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

In [37]:
clf.fit(X_train, y_train)

# Test Data
print("Testing using test data:")  
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91      6361
           1       0.91      0.93      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------


### 134 + all added features

In [40]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [41]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      6361
           1       0.92      0.93      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------


### 134 + selected added features

In [38]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.15)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [39]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      6361
           1       0.92      0.92      0.92      6660

    accuracy                           0.92     13021
   macro avg       0.92      0.92      0.92     13021
weighted avg       0.92      0.92      0.92     13021

------------------------------------------


## 3k features

### 3k features only

In [42]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)
tfidf_vectorizer.fit(X_train_text)

X_train = tfidf_vectorizer.transform(X_train_text)
X_val = tfidf_vectorizer.transform(X_val_text)
X_test = tfidf_vectorizer.transform(X_test_text)

In [43]:
clf.fit(X_train, y_train)

# Validation Data
print("Testing using validation data:")    
y_val_pred = clf.predict(X_val)
print(classification_report(y_val, y_val_pred))
print("------------------------------------------")

# Test Data
print("Testing using test data:")  
y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using validation data:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      6361
           1       0.95      0.96      0.96      6659

    accuracy                           0.95     13020
   macro avg       0.95      0.95      0.95     13020
weighted avg       0.95      0.95      0.95     13020

------------------------------------------
Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.96     13021
   macro avg       0.96      0.96      0.96     13021
weighted avg       0.96      0.96      0.96     13021

------------------------------------------


### 3k + all added features

In [44]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'sentence_count', 'prop_unique_words',
    'avg_sentence_length', 'prop_punctuations', 'prop_stopwords',
    'prop_words_in_quotes', 'prop_nouns', 'prop_verbs', 'prop_adjectives',
    'prop_discourse_relations', 'textblob_sentiment'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [45]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------


### 3k + selected added features

In [47]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1,2), min_df=0.01)

# Create mapper object to combine added features and tfidf word vectors
mapper = DataFrameMapper([
    (['char_count', 'word_count', 'prop_unique_words', 'avg_sentence_length', 'prop_punctuations', 'prop_stopwords', 'prop_nouns'], None), 
    ('text_preprocessed', tfidf_vectorizer)
])

X_train_added_features = mapper.fit_transform(train_data)
X_test_added_features = mapper.transform(test_data)



In [48]:
clf.fit(X_train_added_features, y_train)

# Test Data
print("Testing using test data:")
y_test_pred = clf.predict(X_test_added_features)
print(classification_report(y_test, y_test_pred))
print("------------------------------------------")

Testing using test data:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      6361
           1       0.95      0.96      0.96      6660

    accuracy                           0.95     13021
   macro avg       0.95      0.95      0.95     13021
weighted avg       0.95      0.95      0.95     13021

------------------------------------------
