In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def import_tweets(filename, header = None):
    tweet_dataset =  pd.read_csv(filename, delimiter=',', encoding='latin-1' ,header = header)
    tweet_dataset.columns = ['sentiment','id','date','flag','user','text']
    for i in ['flag','id','user','date']: del tweet_dataset[i] 
    tweet_dataset.sentiment = tweet_dataset.sentiment.replace(4,1)
    return tweet_dataset


def preprocess_tweet(tweet):
    tweet.lower()
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet


def feature_extraction(data, method = "tfidf"):
    if method == "tfidf":
        from sklearn.feature_extraction.text import TfidfVectorizer
        tfv=TfidfVectorizer(sublinear_tf=True, stop_words = "english")
        features=tfv.fit_transform(data)
    elif method == "doc2vec":
        None
    else:
        return "Incorrect inputs"
    return features

def train_classifier(features, label, classifier = "logistic_regression"):
    from sklearn.metrics import roc_auc_score 
    if classifier == "logistic_regression": 
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(C=1.)
    elif classifier == "naive_bayes":
        from sklearn.naive_bayes import MultinomialNB
        model = MultinomialNB()
    elif classifier == "svm":  
        from sklearn.svm import SVC
        model = SVC()
    else:
        print("Incorrect selection of classifier")
    model.fit(features, label)
    probability_to_be_positive = model.predict(features)
    
    print ("f1_score (train data):" ,f1_score(label, probability_to_be_positive))
    print ("accuracy (train data):" ,accuracy_score(label, probability_to_be_positive))
    print ("auc (train data):" , roc_auc_score(label, probability_to_be_positive))




tweet_dataset = import_tweets("training.csv")

tweet_dataset['text'] = tweet_dataset['text'].apply(preprocess_tweet)

x_train,x_test,y_train,y_test = train_test_split(tweet_dataset['text'],tweet_dataset['sentiment'],test_size = 0.30,shuffle = True)




features_train = feature_extraction(x_train, method = "tfidf") 
features_test = feature_extraction(x_test, method = "tfidf") 






In [2]:
epochs = 5
for t in range(epochs):
    print(train_classifier(features_train, y_train, classifier = "naive_bayes")) 

f1_score (train data): 0.7986276156609068
accuracy (train data): 0.7987678571428571
auc (train data): 0.7987684456722703
None
f1_score (train data): 0.7986276156609068
accuracy (train data): 0.7987678571428571
auc (train data): 0.7987684456722703
None
f1_score (train data): 0.7986276156609068
accuracy (train data): 0.7987678571428571
auc (train data): 0.7987684456722703
None
f1_score (train data): 0.7986276156609068
accuracy (train data): 0.7987678571428571
auc (train data): 0.7987684456722703
None
f1_score (train data): 0.7986276156609068
accuracy (train data): 0.7987678571428571
auc (train data): 0.7987684456722703
None


In [3]:
epochs = 5
for t in range(epochs):
    train_classifier(features_test, y_test, classifier = "naive_bayes")

f1_score (train data): 0.8069704026174641
accuracy (train data): 0.8081354166666667
auc (train data): 0.8081292190962726
f1_score (train data): 0.8069704026174641
accuracy (train data): 0.8081354166666667
auc (train data): 0.8081292190962726
f1_score (train data): 0.8069704026174641
accuracy (train data): 0.8081354166666667
auc (train data): 0.8081292190962726
f1_score (train data): 0.8069704026174641
accuracy (train data): 0.8081354166666667
auc (train data): 0.8081292190962726
f1_score (train data): 0.8069704026174641
accuracy (train data): 0.8081354166666667
auc (train data): 0.8081292190962726
