- Database: Covid-fake
- Function: cleaning
- Description: NA

In [None]:
# Start writing code here...
import pandas as pd
import re

In [None]:
# Load the two Constraint English COVID tweet sheets: the "Train" sheet goes
# into `train`, the "Val" sheet into `test`.
train = pd.read_csv(
    '../datasets/covid/Constraint_English_Train - Sheet1.csv'
)
test = pd.read_csv(
    '../datasets/covid/Constraint_English_Val - Sheet1.csv'
)

In [None]:
# Preview the first 10 rows of the training frame as loaded from the CSV.
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
5,6,"Covid Act Now found ""on average each person in...",real
6,7,If you tested positive for #COVID19 and have n...,real
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,fake
8,9,"???Clearly, the Obama administration did not l...",fake
9,10,Retraction—Hydroxychloroquine or chloroquine w...,fake


In [None]:
# Class names in encoding order: a label's integer code is its position here.
labels = ['fake', 'real']


def label_encode(val):
    """Return the integer code of a class name.

    The code is the position of `val` in the module-level `labels` list
    ('fake' -> 0, 'real' -> 1). A value that is not in `labels` raises
    ValueError.
    """
    code = labels.index(val)
    return code

In [None]:
# Replace the string class names in the `label` column of both frames with
# their integer codes.
train['label'] = train['label'].apply(label_encode)
test['label'] = test['label'].apply(label_encode)

In [None]:
# Preview the first 10 training rows again to check the `label` column.
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,1
1,2,States reported 1121 deaths a small rise from ...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,5,Populous states can generate large case counts...,1
5,6,"Covid Act Now found ""on average each person in...",1
6,7,If you tested positive for #COVID19 and have n...,1
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,0
8,9,"???Clearly, the Obama administration did not l...",0
9,10,Retraction—Hydroxychloroquine or chloroquine w...,0


In [None]:
# Preview the first 5 rows of the test frame.
test.head(5)

Unnamed: 0,id,tweet,label
0,1,Chinese converting to Islam after realising th...,0
1,2,11 out of 13 people (from the Diamond Princess...,0
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",0
3,4,Mike Pence in RNC speech praises Donald Trump’...,0
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,1


In [None]:
# Rebuild a default integer index; drop=True discards the old index instead of
# keeping it as a column.
train = train.reset_index(drop=True)
# Raw strings keep the regex escapes (\[ \] \|) away from Python's own string
# escape handling; the non-raw form triggers an "invalid escape sequence"
# warning on recent Python versions.
# Separator characters: each occurrence is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character other than a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Words to drop from the text. Empty here, so no word is filtered out.
STOPWORDS = []


def clean_text(text):
    """Normalize one piece of text.

    Steps, in order:
      1. lowercase the text;
      2. replace every character matched by REPLACE_BY_SPACE_RE with a space;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. split on whitespace, drop words listed in STOPWORDS, and re-join
         with single spaces (this also collapses repeated whitespace and
         strips leading/trailing whitespace).

    Args:
        text: a string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Clean the training tweets, then delete every run of digits.
# regex=True is passed explicitly: Series.str.replace treats the pattern as a
# literal string by default on pandas >= 2.0, which would leave digits in place.
train.tweet = train.tweet.apply(clean_text)
train.tweet = train.tweet.str.replace(r'\d+', '', regex=True)
# Apply the same cleaning to the test data
test = test.reset_index(drop=True)
test.tweet = test.tweet.apply(clean_text)
test.tweet = test.tweet.str.replace(r'\d+', '', regex=True)

In [None]:
# Spot-check 10 randomly chosen training tweets. No random_state is passed,
# so the rows shown can differ between runs.
train.tweet.sample(10)

3406    early identification through aggressive testin...
2899    no states are currently on track to contain co...
595     republicans promise americans theirs will be t...
2089    says nancy pelosi was caught trying to include...
1019                            cocaine kills coronavirus
5781    update from the minhealthnz there are no new c...
6016    such an approach has started showing results a...
3525    starting in march if you travel outside of the...
507     #indiafightscorona mha issues new guidelines #...
1835    most key indicators used to track #covid such ...
Name: tweet, dtype: object

In [None]:
# Spot-check 10 randomly chosen test tweets. No random_state is passed,
# so the rows shown can differ between runs.
test.tweet.sample(10)

193     our hospitalization data became very solid wit...
380     the rate was x that of communityacquired #covi...
1419    rt drharshvardhan #covid update  states uts ha...
329      hour lockdown in telangana state from this tu...
1723    if mississippi continues to #stayathome hospit...
1859     million chinese people convert to islam after...
1213    black americans are hospitalized for coronavir...
694      new cases of #covidnigeria lagos fct abia bor...
1251    important safety measures for #parents do not ...
789     there are up to  possible close contacts being...
Name: tweet, dtype: object

In [None]:
#Now training and testing data, both have been cleaned.
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm


In [None]:
# Hold out 33% of the training rows as a validation split; random_state fixes
# the shuffle so the split is the same on every run.
train_tweet, valid_tweet, train_tweet_label, valid_tweet_label = model_selection.train_test_split(
    train['tweet'],
    train['label'],
    test_size=0.33,
    random_state=42,
)


In [None]:
# Display the tweets that ended up in the training split.
train_tweet

621     sadly there are people with covid in hospital ...
5829    a woman has fallen out of a moving car on to a...
566     i live in bolton i was offered one in invernes...
1837    for a breakdown of cases by states in real tim...
1334    new cases of #covidnigeria fct lagos plateau o...
                              ...                        
3772    people who are sick with coronavirus continue ...
5191    madagascar was covid free on april after regis...
5226    in the third week who identified amp began con...
5390    news smokers cant wait for pubs to reopen so t...
860     pronxt mohfw_india as per goi for #covid cases...
Name: tweet, Length: 4301, dtype: object

In [None]:
#Generate feature vectors using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [None]:
# Word (unigram) level TF-IDF features, capped at the 5000 most frequent terms.
# The vectorizer is fitted on the training split only: fitting on the full
# `train['tweet']` column would let the validation tweets shape the vocabulary
# and IDF weights, making the validation score optimistic.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_tweet)
# Project every split into the feature space learned from the training split.
train_tweet_tfidf = tfidf_vect.transform(train_tweet)
val_tweet_tfidf = tfidf_vect.transform(valid_tweet)
test_tweet_tfidf = tfidf_vect.transform(test['tweet'])

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_y, is_neural_net=False):
    """Fit a classifier and return its accuracy on an evaluation set.

    Args:
        classifier: object exposing `fit(X, y)` and `predict(X)`; it is
            fitted in place.
        feature_vector_train: features passed to `classifier.fit`.
        label: target values passed to `classifier.fit`.
        feature_vector_valid: features passed to `classifier.predict`.
        valid_y: true labels for `feature_vector_valid`.
        is_neural_net: when True, the predictions are reduced with
            `argmax(axis=-1)` before scoring.

    Returns:
        The value returned by `metrics.accuracy_score` for the predictions.
    """
    # Fit the classifier on the training data.
    classifier.fit(feature_vector_train, label)

    # Predict labels for the evaluation data.
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        # Keep the index of the largest value along the last axis.
        predictions = predictions.argmax(axis=-1)

    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    return metrics.accuracy_score(valid_y, predictions)

In [None]:
# Fit an SVM on the unigram TF-IDF training features and score it on the
# validation split.
accuracy = train_model(svm.SVC(), train_tweet_tfidf, train_tweet_label, val_tweet_tfidf, valid_tweet_label)
# The value has to be an argument of print(); written after the closing
# parenthesis it only builds a (None, accuracy) tuple and is never printed.
print("SVM, UniGram Vectors:(Val) ", accuracy)

SVM, UniGram Vectors:(Val) 


(None, 0.931571495988674)

In [None]:
# Fit a fresh SVM on the unigram TF-IDF training features and measure its
# accuracy on the test data.
accuracy_test_svm_uni = train_model(
    svm.SVC(),
    train_tweet_tfidf,
    train_tweet_label,
    test_tweet_tfidf,
    test['label'],
)
print("SVM, Unigram test accuracy: ", accuracy_test_svm_uni)

SVM, Unigram test accuracy:  0.9205607476635514
