### Imports 

In [1]:
import pandas as pd
import numpy as np

# nltk
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# plotting
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from textblob import Word
from textblob import TextBlob

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

### Loading the train and test CSV files of tweets provided by Zindi

In [2]:
# Load the labelled training tweets and the unlabelled test tweets, then
# report each frame's (rows, columns) shape and its row count.
# The labels are printed as plain arguments: applying `%` to a label that has
# no format specifier only "worked" because a pandas Index is treated as a
# mapping by str.__mod__, which silently returns the label unchanged.
train = pd.read_csv('updated_train.csv')
print("Training Set:", train.shape, len(train))
test = pd.read_csv('updated_test.csv')
print("Test Set:", test.shape, len(test))


Training Set: (5287, 3) 5287
Test Set: (1962, 2) 1962


### Data preprocessing

In [3]:
#tweets  = train['text']

def preprocessing(df, text):
    '''Clean and normalise the tweet column of a DataFrame.

    The following steps are applied to ``df[text]`` in order:

    1. strip punctuation (anything that is neither a word character nor
       whitespace);
    2. turn underscores, hashes and every remaining non-letter character
       (digits included) into a space;
    3. lowercase and collapse runs of whitespace;
    4. drop English stopwords;
    5. Porter-stem, then lemmatize, every remaining token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets. The caller's frame is left untouched; the
        work is done on the string-converted frame returned by ``astype``.
    text : str
        Name of the column in ``df`` that contains the tweet text.

    Returns
    -------
    pandas.DataFrame
        A frame in which EVERY column has been cast to ``str`` (so a numeric
        label column comes back as strings such as '0' / '1') and the
        ``text`` column holds the cleaned, space-joined tokens.
    '''
    # Cast the whole frame to str so the .str accessor below never meets a
    # non-string value. Note this also turns label columns into strings.
    df = df.astype(str)

    # `regex` is passed explicitly on every call: pandas >= 2.0 treats the
    # pattern as a literal by default, which would silently disable the two
    # character-class substitutions.
    cleaned = (
        df[text]
        # remove punctuation
        .str.replace(r'[^\w\s]', '', regex=True)
        # underscores count as word characters, so they survive the step above
        .str.replace('_', ' ', regex=False)
        .str.replace('#', ' ', regex=False)
        # blank out everything that is not an ASCII letter; this already
        # covers digits, so no separate number-removal pass is needed
        .str.replace(r'[^a-zA-Z#]', ' ', regex=True)
    )

    # A set gives constant-time membership tests for the stopword filter.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    def normalise(tweet):
        # lowercase + whitespace collapse, stopword removal, then
        # stemming followed by lemmatization (although the two largely overlap)
        tokens = (token.lower() for token in tweet.split())
        kept = (token for token in tokens if token not in stop_words)
        return " ".join(Word(stemmer.stem(token)).lemmatize() for token in kept)

    df[text] = cleaned.apply(normalise)

    return df

In [4]:
# Run the same cleaning pipeline over both frames' 'text' column, then show
# the cleaned test frame as the cell output.
train, test = (preprocessing(frame, 'text') for frame in (train, test))
test

Unnamed: 0,ID,text
0,test_2,explain video take look
1,test_3,ed davey fast ramadan contest
2,test_4,doja cat good miss nicki minaj
3,test_8,bori johnson cheeri wound action persona may s...
4,test_9,man terribl even reason get sport start back
...,...,...
1957,test_2932,fageeru meehaa geyga bandah public fund amp gs...
1958,test_2934,dffn diffus pharmaceut announc pre ind submiss...
1959,test_2936,want wish muslim member congress happi ramadan
1960,test_2937,mean believ conspiraci involv g bill gate micr...


In [5]:
train['text']
# (disabled) visualize the top words of covid-related and non-covid-related tweets in the training dataset
# NOTE(review): preprocessing() casts every column to str, so `train['target'] == 1` would match
# nothing if this were re-enabled as written - compare against '1' instead.
#covid_related  = ' '.join([text for text in train['text'][train['target'] == 1]])

#wordcloud = WordCloud(width=500, height=700, random_state=21, max_font_size=100).generate(covid_related)
#plt.imshow(wordcloud, interpolation='bilinear')

0                                 bitcoin halv cancel due
1       mercyofallah good time wrap granular detail ch...
2       day digit india murder e learn g onlin busi re...
3       india like run remain rna kit essenti test one...
4       tough time best way grow learn case teach help...
                              ...                        
5282    spread novel among asylum seeker add pile alar...
5283    hundr jewish patient treat arab practition mig...
5284    beat honestli peopl follow sport fan l share t...
5285    help u reach peopl donat share ramadan flyer d...
5286    interest rate swap deriv price python harbourf...
Name: text, Length: 5287, dtype: object

In [6]:
# Count how many tweets are labelled covid-related vs. not covid-related.
# The output shows the two classes are roughly balanced (~50% each).
train['target'].value_counts()

0    2746
1    2541
Name: target, dtype: int64

### BoW

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: ignore terms present in more than 90% of the
# tweets or in fewer than 2 tweets, and cap the vocabulary at 1000 terms.
count_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# Learn the vocabulary from the training tweets only and encode them.
# The result is kept sparse; it is not densified here because nothing in
# this cell consumes a dense copy.
train_vectors_bow = count_vectorizer.fit_transform(train['text'])
# Encode the test tweets with the vocabulary learned from the training set.
test_vectors_bow = count_vectorizer.transform(test['text'])

### TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured like the bag-of-words one: ignore terms present
# in more than 90% of the tweets or in fewer than 2, keep at most 1000 terms.
tdif_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix for the training tweets (vocabulary is fitted here).
train_vectors_tdif = tdif_vectorizer.fit_transform(train['text'])
# TF-IDF feature matrix for the test tweets, using the training vocabulary.
test_vectors_tdif = tdif_vectorizer.transform(test['text'])
# The sparse matrix exposes its shape directly, so no dense copy is needed.
test_vectors_tdif.shape

(1962, 1000)

### Splitting of the datasets

In [20]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import BernoulliNB

# Hold-out split of the bag-of-words training features (default test size).
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = train_test_split(
    train_vectors_bow, train['target'], random_state=2
)
# Hold-out split of the TF-IDF training features (30% kept for validation).
x_train_tdif, x_valid_tdif, y_train_tdif, y_valid_tdif = train_test_split(
    train_vectors_tdif, train['target'], test_size=0.3, random_state=17
)

# Logistic regression fitted on the TF-IDF training portion.
Log_Reg = LogisticRegression(random_state=0, solver='lbfgs')
Log_Reg.fit(x_train_tdif, y_train_tdif)

# Score the model on the held-out TF-IDF rows so the validation split is
# actually used to judge the fit before predicting on the test set.
print('Validation accuracy (TF-IDF logistic regression): {}'.format(
    Log_Reg.score(x_valid_tdif, y_valid_tdif)))

# Hard class labels (not probabilities) for the test tweets. The variable
# keeps its historical name even though the features are TF-IDF, not BoW.
predict_bow = Log_Reg.predict(test_vectors_tdif)

# Write the submission file: one row per test tweet ID with its predicted label.
output = pd.DataFrame({'ID': test['ID'], 'target': predict_bow})
output.to_csv('sample_sub.csv', index=False)

### Naive-Bayes (Bernoulli)

In [13]:
# Vectorize the training set with the bag-of-words vectorizer created in the
# BoW section (`count_vectorizer`); no object named `count_vectorizer_bow`
# exists in this notebook, so referencing it fails on a fresh kernel.
X_train = count_vectorizer.fit_transform(train['text'])

# Vectorize the test set with the vocabulary fitted on the training set.
X_test = count_vectorizer.transform(test['text'])

# Output variable "target": the covid-related / not-covid-related label.
y_train = train['target']

# Cross-validated score of a Bernoulli naive Bayes classifier.
# cross_val_score fits clones, so `clf` itself is still unfitted afterwards
# and can be fitted on the full training set below.
clf = BernoulliNB()
scores = cross_val_score(clf, X_train, y_train)
print(scores.mean())

# Fit on all training tweets, predict the test tweets, write the submission.
clf.fit(X_train, y_train)
y_test = clf.predict(X_test)
output = pd.DataFrame({'ID': test['ID'], 'target': y_test})
output.to_csv('sample_sub.csv', index=False)

0.8849997929010782




### Decision Tree

In [14]:
# Decision tree (entropy criterion) on the bag-of-words features.
dtc = DecisionTreeClassifier(criterion='entropy', random_state=1)
dtc_scores = cross_val_score(dtc, X_train, y_train)
# Print the mean cross-validation score; a bare expression in the middle of
# a cell is evaluated but never displayed.
print(dtc_scores.mean())

# Fit on all training tweets, predict the test tweets, write the submission.
dtc.fit(X_train, y_train)
y_test_dtc = dtc.predict(X_test)
output = pd.DataFrame({'ID': test['ID'], 'target': y_test_dtc})
output.to_csv('sample_sub.csv', index=False)



### Ridge Classifier

In [15]:
# Ridge classifier on the bag-of-words features.
ridc = RidgeClassifier()
rc_scores = cross_val_score(ridc, X_train, y_train)
# Print the mean cross-validation score; a bare expression in the middle of
# a cell is evaluated but never displayed.
print(rc_scores.mean())

# Fit on all training tweets, predict the test tweets, write the submission.
ridc.fit(X_train, y_train)
y_test_ridc = ridc.predict(X_test)
output = pd.DataFrame({'ID': test['ID'], 'target': y_test_ridc})
output.to_csv('sample_sub.csv', index=False)



### Logistic Regression

In [16]:
# Logistic regression with default settings: report the mean
# cross-validation score first, then train on the full training matrix.
log_r = LogisticRegression()
log_scores = cross_val_score(log_r, X_train, y_train)
print(log_scores.mean())

log_r.fit(X_train, y_train)
y_test_log = log_r.predict(X_test)

# One row per test tweet ID with its predicted label, saved as the submission.
output = pd.DataFrame({'ID': test['ID'], 'target': y_test_log})
output.to_csv('sample_sub.csv', index=False)

0.8971056369751196




In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Text-classification pipeline: raw tweet -> token counts -> TF-IDF weights
# -> linear classifier trained with stochastic gradient descent.
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    # The step keeps its existing name 'nb' even though the estimator is an
    # SGDClassifier, so any `nb__*` parameter references keep working.
    # random_state is fixed because SGD shuffles the training data; without
    # a seed the fitted model and its scores change from run to run.
    ('nb', SGDClassifier(random_state=0)),
])

In [18]:
# split the training data into the training and test
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, accuracy_score
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['target'], random_state=0)

# fit model
model  = pipeline_sgd.fit(X_train, y_train)

# predict
y_pred = model.predict(X_test)
# print f1_score, precision score, accuracy score
print('F1 Score: {} '.format(f1_score(y_test, y_pred, pos_label='1')))

F1 Score: 0.8986645718774549 


### Predictions on the test dataset

In [None]:
# Label the competition test tweets with the fitted pipeline and attach the
# predictions to the test frame as a 'target' column.
test_pred = model.predict(test['text'])
test['target'] = test_pred

# Keep only the two columns written to the submission file.
submission = test.loc[:, ['ID', 'target']]

# Write the submission to a CSV file, without the index column.
submission.to_csv('updated_ss.csv', index=False)

In [None]:
# Display the submission frame (ID and predicted target) built in the previous cell.
submission

In [25]:
# Side-by-side summary of the models. The first four figures are mean
# cross-validation scores; the SGD figure is the F1 score on the hold-out
# split, so it is not measured the same way as the others.
print(
    'BernoulliNB: {} \n Descision Tree: {} \n Ridge Classifier: {} \n Logistic Regression: {} \n SGDClassifier {}'.format(
        *(cv_scores.mean() for cv_scores in (scores, dtc_scores, rc_scores, log_scores)),
        f1_score(y_test, y_pred, pos_label='1'),
    )
)

BernoulliNB: 0.8849997929010782 
 Descision Tree: 0.8583323192997524 
 Ridge Classifier: 0.8725179516135366 
 Logistic Regression: 0.8971056369751196 
 SGDClassifier 0.8986645718774549
