In [6]:
import logging
def extract_features(df,field,training_data,testing_data,type="binary"):
    """Vectorize one text column of the train/test frames into feature matrices.

    The vectorizer is fitted on the training rows only and then applied to the
    test rows, so the vocabulary is built without looking at the test text.

    Parameters
    ----------
    df
        Not used by this function; kept in the signature so existing callers
        keep working.
    field
        Column label of the text to vectorize; looked up in both
        ``training_data`` and ``testing_data``.
    training_data, testing_data
        Frames indexable by ``field``; the ``.values`` of that column are fed
        to the vectorizer.
    type
        Selects the representation by substring match (note: this parameter
        name shadows the ``type`` builtin inside the function):

        * contains ``"binary"`` -> ``CountVectorizer(binary=True, max_df=0.95)``
        * otherwise contains ``"counts"`` -> ``CountVectorizer(binary=False, max_df=0.95)``
        * anything else -> ``TfidfVectorizer(use_idf=True, max_df=0.95)``

    Returns
    -------
    tuple
        ``(train_feature_set, test_feature_set, vectorizer)`` where
        ``vectorizer`` is the fitted vectorizer instance.
    """

    logging.info("Extracting features and creating vocabulary...")

    # The three representations differ only in which vectorizer is built;
    # everything after construction is shared.
    if "binary" in type:
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # Fallback for every other value of `type` (e.g. 'tfidf').
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform already returns the training matrix, so keep it rather
    # than discarding it and running transform over the same text again.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import logging

# Load the tweet CSV and discard rows with no value in the 'Tokenized' column
df = pd.read_csv('../Output/tweets1.csv', encoding = "ISO-8859-1").dropna(subset=['Tokenized'])

# Split the rows into train and test partitions; the fixed random_state keeps the split repeatable
training_data, testing_data = train_test_split(df, random_state=2000)

# Features: type='tfidf' selects the TF-IDF branch of extract_features
X_train, X_test, feature_transformer = extract_features(
    df,
    'Tokenized',
    training_data,
    testing_data,
    type='tfidf',
)

# Labels: the 'Sentiment' column of each partition
y_train = training_data["Sentiment"].values
y_test = testing_data["Sentiment"].values
print(y_train)


[1 0 0 ... 0 0 0]


In [12]:
# Normalizing continuous variables (don't need to run for this exercise as data not continuous).
# The scaled matrix is only displayed as this cell's output; it is not assigned to a variable.

from sklearn.preprocessing import MaxAbsScaler

# Fit the scaler on the training features, then show the transformed training matrix
transformer = MaxAbsScaler().fit(X_train)
transformer.transform(X_train)

<72777x13661 sparse matrix of type '<class 'numpy.float64'>'
	with 377092 stored elements in Compressed Sparse Row format>

In [15]:
# Build and fit the logistic-regression classifier on the training features
logging.info("Training a Logistic Regression Model...")
scikit_log_reg = LogisticRegression(
    penalty='l2',
    C=5,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=2,
)
model = scikit_log_reg.fit(X_train, y_train)  # keep the value returned by fit() under the name `model`

# Printed under the label 'number of features': the shape of the coefficient array
print('number of features',scikit_log_reg.coef_.shape)

# Predict on the test features and tabulate predictions next to the true labels
predictions = model.predict(X_test)
print(predictions)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results.head(20)

[LibLinear]number of features (1, 13661)
[1 1 0 ... 1 0 1]


Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,1,1
4,0,0
5,1,1
6,1,1
7,1,1
8,0,1
9,0,0


In [16]:
# Calculate classification report
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Compare the test labels with the model's predictions: confusion matrix,
# per-class report, then overall accuracy.
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))
print('accuracy:', accuracy_score(y_true=y_test, y_pred=predictions))

[[ 6805  3912]
 [ 2792 10750]]
              precision    recall  f1-score   support

           0       0.71      0.63      0.67     10717
           1       0.73      0.79      0.76     13542

   micro avg       0.72      0.72      0.72     24259
   macro avg       0.72      0.71      0.72     24259
weighted avg       0.72      0.72      0.72     24259

accuracy: 0.7236489550270003
