In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

def DB_Connection(db_name):
    '''Open the SQLite database named by ``db_name`` and return the connection.

    A confirmation line is printed once ``sqlite3.connect`` has returned.
    '''
    connection = sqlite3.connect(db_name)
    print("Connection to DB successfully")
    return connection
# Open the project database and grab a cursor for ad-hoc queries
db_conn = DB_Connection('../../db.sqlite3')
cursor = db_conn.cursor()

# Show every table recorded in sqlite_master, one row tuple per line
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
for row in cursor.fetchall():
    print(row)


Connection to DB successfully
('twitter_sentiment',)
('django_migrations',)
('sqlite_sequence',)
('django_content_type',)
('auth_group_permissions',)
('auth_user_groups',)
('auth_user_user_permissions',)
('auth_permission',)
('auth_group',)
('auth_user',)
('django_session',)
('twitter_sentiment_test',)


In [2]:
# Read the whole training table into a DataFrame and preview its first rows
train_df = pd.read_sql_query(sql="SELECT * FROM twitter_sentiment", con=db_conn)
train_df.head()



Unnamed: 0,id,name,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Read the whole test table into a DataFrame, report its dimensions, then preview it
test_df = pd.read_sql_query(sql="SELECT * FROM twitter_sentiment_test", con=db_conn)
print(test_df.shape)
test_df.head()

(1000, 4)


Unnamed: 0,id,name,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK corpora that the text preprocessing relies on
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Keep only the label column and the raw tweet text
train_df = train_df[['sentiment', 'text']]

# The stop-word set and the lemmatizer are built on first use and then shared
# by every call, instead of being rebuilt for each tweet.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first call."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a single tweet into a space-separated string of lemmas.

    Steps, in order: lowercase, delete every character that is neither a word
    character nor whitespace, delete digit runs, drop English stop words, and
    lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` (e.g. ``None`` or NaN
        from a missing database value) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    # Missing values arrive as None/NaN rather than str; map them to empty text
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_resources()

    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by a space)
    # NOTE(review): this runs before stop-word filtering, so apostrophised
    # stop words (e.g. "don't" -> "dont") may no longer match the stop-word
    # list -- confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Drop stop words and lemmatize in a single pass over the tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean every tweet and store the result next to the raw text
train_df['ProcessedTweet'] = train_df['text'].map(preprocess_text)

# Target labels
y = train_df['sentiment']

# Bag-of-words counts, with the vocabulary learned from the cleaned training tweets
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(train_df['ProcessedTweet'])



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\itsda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

# Logistic Regression with stratified 5-fold cross-validation.
# The folds are used only to estimate accuracy; the estimator kept in `model`
# is refit on ALL training rows afterwards. Previously `model` was whatever
# had been fitted in the last fold, i.e. trained on only 4/5 of the data.
# Note: X was vectorized on the full training set before splitting, so the
# vocabulary already includes words from each validation fold.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores = []

# Hyperparameters shared by the per-fold and final estimators (C is the tunable one)
lr_params = {"max_iter": 1000, "C": 5.0}

for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fresh estimator per fold so nothing fitted carries over between folds
    fold_model = LogisticRegression(**lr_params)
    fold_model.fit(X_train, y_train)

    # Score this fold on its held-out part
    predictions = fold_model.predict(X_val)
    acc = accuracy_score(y_val, predictions)
    acc_scores.append(acc)

# Average accuracy across all folds
average_accuracy = np.mean(acc_scores)
print(f"Average Accuracy: {average_accuracy}")

# Final model: same hyperparameters, trained on the complete training set
model = LogisticRegression(**lr_params)
model.fit(X, y)

In [None]:
# Re-read the held-out tweets and keep just the label and raw text columns
test_df = pd.read_sql_query(sql="SELECT * FROM twitter_sentiment_test", con=db_conn)
test_df = test_df[['sentiment', 'text']]
test_df['ProcessedTweet'] = test_df['text'].map(preprocess_text)

# Ground-truth labels, and features encoded with the already-fitted vectorizer
y_test = test_df['sentiment']
X_test = vectorizer.transform(test_df['ProcessedTweet'])

# Predict the test tweets and compute overall accuracy
predictions_test = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions_test)

# Report accuracy followed by the per-class breakdown
print(f"Accuracy on Test Set: {accuracy}")
print("\nClassification Report on Test Set:\n")
print(classification_report(y_test, predictions_test))


In [None]:
# Plotting the Confusion Matrix
# Fix the label order explicitly (sorted labels seen in truth or predictions)
# so the same list can name the heatmap ticks; without tick labels the axes
# show bare indices 0..3 and the reader cannot tell which class is which.
class_labels = sorted(set(y_test) | set(predictions_test))
cm = confusion_matrix(y_test, predictions_test, labels=class_labels)

fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix for Test Set')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from lime.lime_text import LimeTextExplainer
def predict_proba(texts):
    """Return the model's class probabilities for an iterable of raw texts.

    Each text is cleaned with ``preprocess_text``, encoded with the fitted
    ``vectorizer`` and passed to ``model.predict_proba``.
    """
    cleaned = list(map(preprocess_text, texts))
    features = vectorizer.transform(cleaned)
    return model.predict_proba(features)

# Create LIME Text Explainer.
# The probability columns returned by predict_proba follow model.classes_,
# so the display names are taken from it. A hand-written list in a different
# order would show each probability under the wrong sentiment name.
explainer = LimeTextExplainer(class_names=[str(label) for label in model.classes_])

# Choose an index of texts for explanation
idx = 13
text_instance = test_df['text'][idx]

# Generate explanation from the raw text; predict_proba does its own preprocessing.
# NOTE(review): with no `labels`/`top_labels` argument LIME appears to explain
# the class at index 1 only -- confirm that is the class of interest.
exp = explainer.explain_instance(text_instance, predict_proba, num_features=4)
exp.show_in_notebook(text=True)



In [None]:
import joblib
# Bundle the classifier together with the vectorizer used to encode its inputs
model_info = {}
model_info["model"] = model
model_info["vectorizer"] = vectorizer

# Serialise the bundle to the file 'trained_model'
joblib.dump(model_info, 'trained_model')

