In [24]:
import pandas as pd
import seaborn as sns
import sys
import re
import math
import nltk
import numpy as np
import pandas as pd
import warnings
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer, sent_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [16]:
def warn(*args, **kwargs):
    pass

warnings.warn = warn

In [15]:
RANDOM_STATE = 42
RF_N_ESTIMATORS = 500
RF_MIN_SAMPLES_SPLIT = 15

In [2]:
raw_train_df = pd.read_csv('https://raw.githubusercontent.com/MiHarsh/Public_stuffs/master/Constraint_English_Train%20-%20Sheet1.csv')
raw_test_df = pd.read_csv('https://raw.githubusercontent.com/MiHarsh/Public_stuffs/master/Constraint_English_Val%20-%20Sheet1.csv')

In [3]:
print(raw_train_df.shape)
raw_train_df.sample(n=3)

(6420, 3)


Unnamed: 0,id,tweet,label
4320,4321,.@CMSGov &amp; @CDCgov announce that reimburse...,real
5682,5683,TB program staff across the U.S. are respondin...,real
4297,4298,Big end of night update. @GavinNewsom put out ...,real


## <font color='darkblue'>Preprocess</font>

### Feature engineering

In [4]:
def count_chars(text):
    return len(text)

def count_words(text):
    return len(text.split())

def count_capital_chars(text):
    count=0
    for i in text:
        if i.isupper():
            count+=1
    return count

def count_capital_words(text):
    return sum(map(str.isupper,text.split()))

def count_punctuations(text):
    punctuations='!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~'
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=text.count(i)
    return d 

def count_words_in_quotes(text):
    x = re.findall("'.'|\".\"", text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count
    
def count_sent(text):
    return len(nltk.sent_tokenize(text))

def count_unique_words(text):
    return len(set(text.split()))

def count_htags(text):
    x = re.findall(r'(#w[A-Za-z0-9]*)', text)
    return len(x)

def count_mentions(text):
    x = re.findall(r'(@w[A-Za-z0-9]*)', text)
    return len(x)

def count_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

def gen_features(df):
    df['char_count'] = df["tweet"].apply(lambda x:count_chars(x))
    df['word_count'] = df["tweet"].apply(lambda x:count_words(x))
    df['sent_count'] = df["tweet"].apply(lambda x:count_sent(x))
    df['capital_char_count'] = df["tweet"].apply(lambda x:count_capital_chars(x))
    df['capital_word_count'] = df["tweet"].apply(lambda x:count_capital_words(x))
    df['quoted_word_count'] = df["tweet"].apply(lambda x:count_words_in_quotes(x))
    df['stopword_count'] = df["tweet"].apply(lambda x:count_stopwords(x))
    df['unique_word_count'] = df["tweet"].apply(lambda x:count_unique_words(x))
    df['htag_count'] = df["tweet"].apply(lambda x:count_htags(x))
    df['mention_count'] = df["tweet"].apply(lambda x:count_mentions(x))
    df['punct_count'] = df["tweet"].apply(lambda x:count_punctuations(x))
    df['avg_wordlength'] = df['char_count']/df['word_count']
    df['avg_sentlength'] = df['word_count']/df['sent_count']
    df['unique_vs_words'] = df['unique_word_count']/df['word_count']
    df['stopwords_vs_words'] = df['stopword_count']/df['word_count']
    return df

In [5]:
train_df = gen_features(raw_train_df)
test_df = gen_features(raw_test_df)

In [6]:
print(train_df.shape)
train_df.sample(n=3)

(6420, 18)


Unnamed: 0,id,tweet,label,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,punct_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words
3739,3740,However only 14% of countries reported removin...,real,178,21,1,14,2,0,5,21,0,0,"{'! count': 0, '"" count': 0, '# count': 1, '$ ...",8.47619,21.0,1.0,0.238095
4339,4340,Various steps taken by GOI for removal of all ...,real,305,42,5,45,5,0,12,38,0,0,"{'! count': 0, '"" count': 0, '# count': 5, '$ ...",7.261905,8.4,0.904762,0.285714
6256,6257,Feds’ $20 million mask contract to a shady man...,real,97,15,2,10,1,0,5,15,0,0,"{'! count': 0, '"" count': 0, '# count': 0, '$ ...",6.466667,7.5,1.0,0.333333


In [7]:
test_punct_df = pd.DataFrame(list(test_df.punct_count))
train_punct_df = pd.DataFrame(list(train_df.punct_count))
train_punct_df.head(n=3)

Unnamed: 0,! count,""" count",# count,$ count,% count,& count,' count,( count,) count,* count,...,[ count,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,2,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Merging pnctuation DataFrame with main DataFrame
train_df = pd.merge(train_df, train_punct_df, left_index=True, right_index=True)
test_df = pd.merge(test_df, test_punct_df, left_index=True, right_index=True)

Unnamed: 0,id,tweet,label,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,...,[ count,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count
3194,3195,The existence of a canine coronavirus vaccine ...,fake,104,17,1,1,0,0,8,...,0,0,0,0,0,0,0,0,0,0
5637,5638,A post says that the USA arrested the man who ...,fake,82,16,1,5,2,0,6,...,0,0,0,0,0,0,0,0,0,0
3306,3307,#CoronaVirusUpdates #IndiaFightsCorona India’s...,real,270,24,2,52,2,0,4,...,0,0,0,0,1,0,0,0,0,0


In [9]:
print(train_df.shape)
train_df.sample(n=3)

(6420, 50)


Unnamed: 0,id,tweet,label,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,...,[ count,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count
4872,4873,RT @PIB_India: #CoronaWatch ◾ 3044940 total co...,real,129,18,1,9,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6240,6241,Together with @gavi and the @CEPIvaccines we a...,real,230,32,1,24,4,0,13,...,0,0,0,0,0,0,0,0,0,0
1384,1385,#IndiaFightsCorona: 1054 case fatalities have ...,real,291,38,3,29,0,0,9,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# We can drop "punct_count" column from both df and test DataFrame
train_df.drop(columns=['punct_count'], inplace=True)
test_df.drop(columns=['punct_count'], inplace=True)
train_df.columns

Index(['id', 'tweet', 'label', 'char_count', 'word_count', 'sent_count',
       'capital_char_count', 'capital_word_count', 'quoted_word_count',
       'stopword_count', 'unique_word_count', 'htag_count', 'mention_count',
       'avg_wordlength', 'avg_sentlength', 'unique_vs_words',
       'stopwords_vs_words', '! count', '" count', '# count', '$ count',
       '% count', '& count', '' count', '( count', ') count', '* count',
       '+ count', ', count', '- count', '. count', '/ count', ': count',
       '; count', '< count', '= count', '> count', '? count', '@ count',
       '[ count', '\ count', '] count', '^ count', '_ count', '` count',
       '{ count', '| count', '} count', '~ count'],
      dtype='object')

### Re-processing
We performed a simple pre-processing step, like removing links, removing user name, numbers, double space, punctuation, lower casing, etc.

In [11]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'httpS+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet
def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RTs@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

my_punctuation = '!"$%&\'()*+,-./:;<=>?[\]^_`{|}~•@'
    
def preprocess(sent):
    sent = remove_users(sent)
    sent = remove_links(sent)
    sent = sent.lower() # lower case
    sent = re.sub('['+my_punctuation + ']+', ' ', sent) # strip punctuation
    sent = re.sub('s+', ' ', sent) #remove double spacing
    sent = re.sub('([0-9]+)', '', sent) # remove numbers
    sent_token_list = [word for word in sent.split(' ')]
    sent = ' '.join(sent_token_list)
    return sent

train_df['tweet']   = train_df['tweet'].apply(lambda x: preprocess(x))
test_df['tweet'] = test_df['tweet'].apply(lambda x: preprocess(x))

In [12]:
train_df.columns

Index(['id', 'tweet', 'label', 'char_count', 'word_count', 'sent_count',
       'capital_char_count', 'capital_word_count', 'quoted_word_count',
       'stopword_count', 'unique_word_count', 'htag_count', 'mention_count',
       'avg_wordlength', 'avg_sentlength', 'unique_vs_words',
       'stopwords_vs_words', '! count', '" count', '# count', '$ count',
       '% count', '& count', '' count', '( count', ') count', '* count',
       '+ count', ', count', '- count', '. count', '/ count', ': count',
       '; count', '< count', '= count', '> count', '? count', '@ count',
       '[ count', '\ count', '] count', '^ count', '_ count', '` count',
       '{ count', '| count', '} count', '~ count'],
      dtype='object')

In [13]:
%%time
vectorizer            =  TfidfVectorizer()
train_tf_idf_features =  vectorizer.fit_transform(train_df['tweet']).toarray()
test_tf_idf_features  =  vectorizer.transform(test_df['tweet']).toarray()

# Converting above list to DataFrame
train_tf_idf_df          = pd.DataFrame(train_tf_idf_features)
test_tf_idf_df           = pd.DataFrame(test_tf_idf_features)

# Saparating train and test labels from all features
y_train               = train_df['label']
y_test                = test_df['label']

#Listing all features
features = ['char_count', 'word_count', 'sent_count',
       'capital_char_count', 'capital_word_count', 'quoted_word_count',
       'stopword_count', 'unique_word_count', 'htag_count', 'mention_count',
       'avg_wordlength', 'avg_sentlength', 'unique_vs_words',
       'stopwords_vs_words', '! count', '" count', '# count', '$ count',
       '% count', '& count', '\' count', '( count', ') count', '* count',
       '+ count', ', count', '- count', '. count', '/ count', ': count',
       '; count', '< count', '= count', '> count', '? count', '@ count',
       '[ count', '\ count', '] count', '^ count', '_ count', '` count',
       '{ count', '| count', '} count', '~ count']

# Finally merging all features with above TF-IDF. 
X_train = pd.merge(train_tf_idf_df, train_df[features], left_index=True, right_index=True)
X_test  = pd.merge(test_tf_idf_df, test_df[features], left_index=True, right_index=True)

CPU times: user 3.21 s, sys: 1.24 s, total: 4.45 s
Wall time: 4.46 s


In [14]:
print(X_train.shape)
X_train.sample(n=3)

(6420, 13918)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,[ count,\ count,] count,^ count,_ count,` count,{ count,| count,} count,~ count
5495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,2,0,0,0,0,0
1114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## <font color='darkblue'>Feature reduction (PCA)</font>
* **[Principal component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) (PCA).**
> Principal component analysis (PCA) is the process of computing the principal components and using them to perform a change of basis on the data, sometimes using only the first few principal components and ignoring the rest.

From sklearn, we can leverage [**sklearn.decomposition.PCA**](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) to achieve this task:

In [18]:
%%time
# ML performance with  full features:
clf_model = RandomForestClassifier(
    n_estimators = RF_N_ESTIMATORS, min_samples_split = RF_MIN_SAMPLES_SPLIT, random_state = RANDOM_STATE)
clf_model.fit(X_train, y_train)

CPU times: user 1min 44s, sys: 715 ms, total: 1min 45s
Wall time: 1min 52s


In [22]:
y_pred = clf_model.predict(X_test)

In [23]:
print(f'Accuracy: {accuracy_score(y_test, y_pred):.01%}')
print(classification_report(y_test, y_pred))

Accuracy: 93.2%
              precision    recall  f1-score   support

        fake       0.96      0.89      0.93      1020
        real       0.91      0.97      0.94      1120

    accuracy                           0.93      2140
   macro avg       0.94      0.93      0.93      2140
weighted avg       0.93      0.93      0.93      2140



#### PCA(1000)

In [39]:
# Leverage PCA to reduce feature size to 1000
NUM_FEATURE = 1000
PCA_COLUMNS = columns=[f'pca_{i}' for i in range(NUM_FEATURE)]
pca = PCA(n_components=NUM_FEATURE)
pca.fit(X_train)

In [33]:
print('Top-10 components variance:')
for i in range(10):
    print(f'\t{pca.explained_variance_ratio_[i]:.02%}')

Top-10 components variance:
	98.8%
	0.6%
	0.3%
	0.1%
	0.1%
	0.0%
	0.0%
	0.0%
	0.0%
	0.0%


In [40]:
X_train_1000_df = pd.DataFrame(pca.transform(X_train),
                               columns=PCA_COLUMNS)
X_test_1000_df = pd.DataFrame(pca.transform(X_test),
                              columns=PCA_COLUMNS)

In [41]:
print(X_train_1000_df.shape)
X_train_1000_df.sample(n=3)

(6420, 1000)


Unnamed: 0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,...,pca_990,pca_991,pca_992,pca_993,pca_994,pca_995,pca_996,pca_997,pca_998,pca_999
4464,88.163029,-3.334165,4.46664,-4.292164,-3.012841,0.947056,-0.037998,1.023775,0.660819,0.027674,...,-0.001205,-0.010569,0.004029,-0.023748,0.032786,-0.000737,-0.009473,-0.006623,-0.017635,-0.017024
50,121.833273,4.662563,9.123641,1.584153,-0.63118,-3.295858,-2.722441,-0.275923,-2.106189,-0.563477,...,0.011841,0.011209,0.008758,-0.015616,-0.000187,-0.001497,0.005261,-0.004174,-0.013675,0.006325
4326,-80.689867,-4.303051,-4.007151,2.051568,2.419079,1.203917,0.361049,-0.352929,0.15006,-0.191267,...,0.008785,0.006376,-0.005642,0.014353,0.006251,0.016512,-0.001091,-0.004603,-0.014451,-0.00341


In [42]:
%%time
clf_model = RandomForestClassifier(
    n_estimators = RF_N_ESTIMATORS, min_samples_split = RF_MIN_SAMPLES_SPLIT, random_state = RANDOM_STATE)
clf_model.fit(X_train_1000_df, y_train)

CPU times: user 2min 11s, sys: 527 ms, total: 2min 12s
Wall time: 2min 14s


In [44]:
y_pred = clf_model.predict(X_test_1000_df)

In [45]:
print(f'Accuracy: {accuracy_score(y_test, y_pred):.01%}')
print(classification_report(y_test, y_pred))

Accuracy: 88.9%
              precision    recall  f1-score   support

        fake       0.96      0.80      0.87      1020
        real       0.84      0.97      0.90      1120

    accuracy                           0.89      2140
   macro avg       0.90      0.88      0.89      2140
weighted avg       0.90      0.89      0.89      2140



#### PCA(2000)

In [46]:
def pca_test(feature_size):
    # Leverage PCA to reduce feature size to 1000
    PCA_COLUMNS = columns=[f'pca_{i}' for i in range(feature_size)]
    pca = PCA(n_components=feature_size)
    pca.fit(X_train)
    _X_train_pca_df = pd.DataFrame(pca.transform(X_train),
                                   columns=PCA_COLUMNS)
    _X_test_pca_df = pd.DataFrame(pca.transform(X_test),
                                  columns=PCA_COLUMNS)
    st = datetime.now()
    clf_model = RandomForestClassifier(
        n_estimators = RF_N_ESTIMATORS, min_samples_split = RF_MIN_SAMPLES_SPLIT,
        random_state = RANDOM_STATE)
    clf_model.fit(_X_train_pca_df, y_train)
    train_time = datetime.now() - st
    y_pred = clf_model.predict(_X_test_pca_df)
    return accuracy_score(y_test, y_pred), train_time, clf_model

In [49]:
accuracy, train_time, model = pca_test(100)

In [50]:
print(f'Feature size=100 leads to model accuracy={accuracy:.01%} and training time={train_time}')

Feature size=100 leads to model accuracy=91.7% and training time=0:00:50.196046
