In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scikitplot.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from time import time
from sklearn.metrics import accuracy_score


In [2]:
import pandas as pd

# Load the pre-cleaned tweets CSV from the notebook's input directory.
df = pd.read_csv(
    filepath_or_buffer="../input/clean-16-million-tweets/clean_tweets_again.csv",
)

In [3]:
# Drop every row that contains a missing value, then discard the
# 'Unnamed: 0' column (presumably the index column saved with the CSV).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# 'Unnamed: 0' has already been removed.
df = df.dropna().drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Model input is the 'text' column; the target is the 'sentiment' column.
X, y = df['text'], df['sentiment']

# Here the data is split into three chunks: a training set, a development set and a test set
#### Train set: the sample data used for learning
### Development set: the data used to tune the hyperparameters of the algorithm
### Test set: the data used to test the model

In [5]:
# First cut of the 98|1|1 split: hold out 2% of the rows and keep the
# remaining 98% for training. The 2% hold-out is halved in the next cell.

from sklearn.model_selection import train_test_split

X_train, X_validation_test, y_train, y_validation_test = train_test_split(
    X,
    y,
    test_size=0.02,
    random_state=45,
)

In [6]:
# Halve the 2% hold-out into equally sized validation and test sets.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_validation_test,
    y_validation_test,
    test_size=0.5,
    random_state=45,
)

In [7]:
def _class_fractions(label_frame):
    """Return the (positive, negative) share of rows in a 'sentiment' frame.

    Positive rows are those with sentiment == 1 and negative rows those with
    sentiment == 0; both values are fractions of the frame's length.
    """
    total = len(label_frame)
    positives = len(label_frame[label_frame['sentiment'] == 1])
    negatives = len(label_frame[label_frame['sentiment'] == 0])
    return positives / total, negatives / total


# One single-column frame per split, built from that split's label series.
train_df = pd.DataFrame(y_train)
validation_df = pd.DataFrame(y_validation)
test_df = pd.DataFrame(y_test)

# (split name, features, label frame, extra print arguments). The first two
# reports pass an extra '\n' to print; the last one does not.
split_reports = (
    ('training', X_train, train_df, ('\n',)),
    ('validation', X_validation, validation_df, ('\n',)),
    ('test', X_test, test_df, ()),
)

for split_name, split_features, split_labels, trailing in split_reports:
    percent_positive, percent_negative = _class_fractions(split_labels)
    print(
        'The shape of {} set is'.format(split_name),
        len(split_features),
        'with positive entries as',
        percent_positive * 100,
        'and negative entries as',
        percent_negative * 100,
        *trailing
    )

The shape of training set is 1564120 with positive entries as 49.99469350177736 and negative entries as 50.005306498222645 

The shape of validation set is 15960 with positive entries as 49.85588972431078 and negative entries as 50.14411027568922 

The shape of test set is 15961 with positive entries as 49.55829835223357 and negative entries as 50.441701647766436


**When comparing machine learning algorithms, a baseline is used as a point of reference. Here we use the zero-rule (null) classifier, which always predicts the majority class. Even though this classifier has no predictive power, it is useful for establishing the baseline.**

**Another baseline for classifying the data is TextBlob, which has a built-in sentiment classifier.**

In [8]:
# Null (majority-class) accuracy on the test labels: the score obtained by
# always predicting the more frequent class. It is the baseline used below.
negative_share = len(X_test[y_test == 0]) / len(X_test)
null_accuracy = max(negative_share, 1 - negative_share)

# Vocabulary sizes to try: 10,000 up to 100,000 in steps of 10,000.
number_of_features = np.arange(10000, 100001, 10000)
clf = LogisticRegression(n_jobs=-1, verbose=8)
count_vec = TfidfVectorizer()  # note: TF-IDF weighting, despite the variable name
result = []  # one (max_features, accuracy, elapsed seconds) tuple per run
print(clf)
print('\n')
for number in number_of_features:
    # Unigrams only, stop words kept, vocabulary capped at `number` terms.
    count_vec.set_params(stop_words=None, max_features=number, ngram_range=(1, 1))
    pipeline = Pipeline([('vectorizer', count_vec), ('classifier', clf)], verbose=1)
    print('validation result for {} features'.format(number))
    # NOTE(review): the message says "validation" but the model is scored on
    # X_test / y_test. Consider X_validation / y_validation for this sweep so
    # the test set stays untouched; left as-is to match the recorded output.
    t0 = time()
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    train_test_time = time() - t0  # covers both fitting and prediction
    accuracy = accuracy_score(y_test, pred)  # signature is (y_true, y_pred)
    print('the training and testing time is ', train_test_time, ' seconds')
    if accuracy > null_accuracy:
        print('accurcy', accuracy, ' greatet than null accuracy ', null_accuracy)
    else:
        print('accurcy', accuracy, ' less than null accuracy ', null_accuracy)
    # Record every run. This append used to sit outside the loop, so only the
    # final (100000-feature) run was stored in `result`.
    result.append((number, accuracy, train_test_time))
        

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=8,
                   warm_start=False)


validation result for 10000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  40.0s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   37.1s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  37.2s
the training and testing time is  77.71299409866333  seconds
accurcy 0.8008270158511371  greatet than null accuracy  0.5044170164776643
validation result for 20000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  39.0s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   38.8s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  38.9s
the training and testing time is  78.23821806907654  seconds
accurcy 0.8037090407869181  greatet than null accuracy  0.5044170164776643
validation result for 30000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.7s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.9s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  42.0s
the training and testing time is  81.1456995010376  seconds
accurcy 0.804460873378861  greatet than null accuracy  0.5044170164776643
validation result for 40000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.6s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   45.9s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  46.0s
the training and testing time is  85.07326579093933  seconds
accurcy 0.8052753586867991  greatet than null accuracy  0.5044170164776643
validation result for 50000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.8s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   47.4s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  47.5s
the training and testing time is  86.73376631736755  seconds
accurcy 0.8042729152308753  greatet than null accuracy  0.5044170164776643
validation result for 60000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.7s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   50.5s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  50.6s
the training and testing time is  89.7258951663971  seconds
accurcy 0.8040849570828895  greatet than null accuracy  0.5044170164776643
validation result for 70000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  39.0s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   54.9s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  55.0s
the training and testing time is  94.39271235466003  seconds
accurcy 0.8048994423908277  greatet than null accuracy  0.5044170164776643
validation result for 80000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  39.7s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   57.2s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  57.3s
the training and testing time is  97.4627673625946  seconds
accurcy 0.80552596955078  greatet than null accuracy  0.5044170164776643
validation result for 90000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.9s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   59.6s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  59.7s
the training and testing time is  99.04488325119019  seconds
accurcy 0.8052753586867991  greatet than null accuracy  0.5044170164776643
validation result for 100000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  38.9s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total= 1.1min
the training and testing time is  104.06822800636292  seconds
accurcy 0.8062151494267277  greatet than null accuracy  0.5044170164776643


In [9]:
print('computation with unigram and without stopwords')

# Null (majority-class) accuracy on the test labels, used as the baseline.
# The previous else branch referenced an undefined `x_test` (lowercase) and
# would have raised NameError whenever class 0 was not the majority.
negative_share = len(X_test[y_test == 0]) / len(X_test)
null_accuracy = max(negative_share, 1 - negative_share)

# Vocabulary sizes to try: 10,000 up to 100,000 in steps of 10,000.
number_of_features = np.arange(10000, 100001, 10000)
clf = LogisticRegression(n_jobs=-1, verbose=8)
count_vec = TfidfVectorizer(stop_words='english')  # note: TF-IDF, despite the name
result_unigram_without_stopwords = []  # (max_features, accuracy, seconds) per run
print(clf)
print('\n')
for number in number_of_features:
    # Unigrams only, English stop words removed, vocabulary capped at `number`.
    count_vec.set_params(stop_words='english', max_features=number, ngram_range=(1, 1))
    pipeline = Pipeline([('vectorizer', count_vec), ('classifier', clf)], verbose=1)
    print('validation result for {} features'.format(number))
    # NOTE(review): the message says "validation" but the model is scored on
    # X_test / y_test. Consider X_validation / y_validation for this sweep so
    # the test set stays untouched; left as-is to match the recorded output.
    t0 = time()
    pipeline.fit(X_train, y_train)
    pred = pipeline.predict(X_test)
    train_test_time = time() - t0  # covers both fitting and prediction
    accuracy = accuracy_score(y_test, pred)  # signature is (y_true, y_pred)
    print('the training and testing time is ', train_test_time, ' seconds')
    if accuracy > null_accuracy:
        print('accuracy', accuracy, 'greater then null accuracy', null_accuracy)
    else:
        print('accuracy', accuracy, 'less than null accuracy', null_accuracy)
    print('\n')
    # Record every run. This append used to sit outside the loop, so only the
    # final (100000-feature) run was stored.
    result_unigram_without_stopwords.append((number, accuracy, train_test_time))

computation with unigram and without stopwords
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=8,
                   warm_start=False)


validation result for 10000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.0s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   33.1s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  33.1s
the training and testing time is  68.61607813835144  seconds
accuracy 0.7728212518012656 greater then null accuracy 0.5044170164776643


validation result for 20000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  34.7s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   35.7s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  35.8s
the training and testing time is  70.95119047164917  seconds
accuracy 0.7741369588371656 greater then null accuracy 0.5044170164776643


validation result for 30000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.1s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   35.1s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  35.2s
the training and testing time is  70.75848126411438  seconds
accuracy 0.7753900131570703 greater then null accuracy 0.5044170164776643


validation result for 40000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  34.7s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   40.3s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  40.4s
the training and testing time is  75.48648738861084  seconds
accuracy 0.7757032767370465 greater then null accuracy 0.5044170164776643


validation result for 50000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.1s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   43.2s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  43.3s
the training and testing time is  78.73252630233765  seconds
accuracy 0.7769563310569513 greater then null accuracy 0.5044170164776643


validation result for 60000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.2s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   45.0s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  45.1s
the training and testing time is  80.72434663772583  seconds
accuracy 0.776079193033018 greater then null accuracy 0.5044170164776643


validation result for 70000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.1s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   46.3s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  46.4s
the training and testing time is  81.85373854637146  seconds
accuracy 0.7765177620449847 greater then null accuracy 0.5044170164776643


validation result for 80000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.1s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   50.7s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  50.7s
the training and testing time is  86.20836019515991  seconds
accuracy 0.7773322473529227 greater then null accuracy 0.5044170164776643


validation result for 90000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.2s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   54.5s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  54.6s
the training and testing time is  90.18562126159668  seconds
accuracy 0.7763924566129942 greater then null accuracy 0.5044170164776643


validation result for 100000 features
[Pipeline] ........ (step 1 of 2) Processing vectorizer, total=  35.4s


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   56.6s finished


[Pipeline] ........ (step 2 of 2) Processing classifier, total=  56.7s
the training and testing time is  92.42504501342773  seconds
accuracy 0.7770189837729465 greater then null accuracy 0.5044170164776643




**Let's create a list of custom stopwords. In this case we will add custom stopwords selected from the top 10 features in the list.**

In [10]:
# Save the results of the stop-word-removed unigram run to 'df2.csv'.
df1 = pd.DataFrame(data=result_unigram_without_stopwords)
df1.to_csv(path_or_buf='df2.csv')

In [11]:
# Save the results of the unigram run that kept stop words to 'df.csv'.
df2 = pd.DataFrame(data=result)
df2.to_csv(path_or_buf='df.csv')