In [45]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math

# Load the training set and the two sentiment lexicons.
data = pd.read_csv('training.csv')

# The lexicon files carry no header row: their first line is already a word
# ('2-faced' for the negative list, 'a+' for the positive list). Reading with
# header=None keeps that first entry as data instead of consuming it as the
# column name; the single word column is then named explicitly.
negativewords = pd.read_csv('negative-words.csv', header=None)
negativewords = negativewords.rename(columns={0: 'neg'})
negativewords['val'] = -1  # scalar broadcast: every negative word scores -1

positivewords = pd.read_csv('positive-words.csv', header=None)
positivewords = positivewords.rename(columns={0: 'pos'})
positivewords['val'] = 1  # every positive word scores +1

In [46]:
import re

# Memoised lower-cased lexicon sets, stored together with the DataFrame
# objects they were derived from so a rebound lexicon is detected.
_LEXICON_CACHE = {}


def _lexicon_sets():
    """Return ``(negative_words, positive_words)`` as sets of lower-cased words.

    The sets are built from the module-level ``negativewords['neg']`` and
    ``positivewords['pos']`` columns. Building them is the expensive part of
    scoring, so the result is cached and only rebuilt when either global is
    rebound to a different DataFrame object (e.g. after re-running the cell
    that loads the lexicons). In-place edits of the same DataFrame object are
    not detected.
    """
    neg_frame, pos_frame = negativewords, positivewords
    stale = (
        _LEXICON_CACHE.get('neg_frame') is not neg_frame
        or _LEXICON_CACHE.get('pos_frame') is not pos_frame
    )
    if stale:
        _LEXICON_CACHE.update(
            neg_frame=neg_frame,
            pos_frame=pos_frame,
            neg=set(neg_frame['neg'].str.lower()),
            pos=set(pos_frame['pos'].str.lower()),
        )
    return _LEXICON_CACHE['neg'], _LEXICON_CACHE['pos']


def parsing_words(s):
    """Score a text by the balance of lexicon hits among its words.

    The text is lower-cased and split into ``\\w+`` tokens. Each token found in
    the negative lexicon contributes -1 and each token found in the positive
    lexicon contributes +1 (a token present in both lexicons contributes both).

    Parameters
    ----------
    s : str
        The text to score.

    Returns
    -------
    float or int
        The mean of the contributions, in the range [-1, 1], or ``0`` when no
        token matches either lexicon.
    """
    negative_words, positive_words = _lexicon_sets()
    hits = []
    for word in re.findall(r'\b\w+\b', s.lower()):
        # Two independent checks (not elif): a word listed in both lexicons
        # is counted once for each.
        if word in negative_words:
            hits.append(-1)
        if word in positive_words:
            hits.append(1)
    if hits:
        return sum(hits) / len(hits)
    return 0

# Emotion encoding of the integer class labels:
# 0 = sadness
# 1 = joy
# 2 = love
# 3 = anger
# 4 = fear
# 5 = surprise

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

# Feature builder: bag-of-words counts (vocabulary capped via max_features/min_df to limit memory)
# combined with scaled length features and a lexicon sentiment score, stacked as a sparse matrix.
def create_features(df, vectorizer=None, scaler=None):
    """Build the sparse model input for a frame of texts.

    The feature matrix is the horizontal concatenation of
    (1) unigram counts of ``df['text']``,
    (2) ``text_length`` and ``word_count`` scaled by a ``MaxAbsScaler``, and
    (3) the unscaled ``sentiment_score`` returned by ``parsing_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. The columns
        ``'text_length'``, ``'word_count'`` and ``'sentiment_score'`` are
        added to (or overwritten on) this frame in place.
    vectorizer : fitted CountVectorizer, optional
        When given, it is only used to ``transform`` the text, so the columns
        line up with the matrix it was fitted on. When ``None`` (default) a
        new ``CountVectorizer(ngram_range=(1, 1), max_features=5000, min_df=5)``
        is fitted on ``df['text']``.
    scaler : fitted MaxAbsScaler, optional
        When given, it is only used to ``transform`` the length features.
        When ``None`` (default) a new ``MaxAbsScaler`` is fitted on them.

    Returns
    -------
    tuple
        ``(X_combined, vectorizer, scaler)``: the sparse feature matrix and
        the vectorizer/scaler that produced it (the ones passed in, or the
        newly fitted ones).
    """
    # Local import: later notebook cells rebind the global name `hstack`
    # (`from numpy import hstack`), and numpy's version cannot stack sparse
    # matrices.
    from scipy.sparse import hstack as sparse_hstack

    # Hand-crafted per-text features, stored on the caller's frame.
    df['text_length'] = df['text'].apply(len)
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['sentiment_score'] = df['text'].apply(parsing_words)

    # Bag-of-words counts: fit only when no fitted vectorizer was supplied.
    if vectorizer is None:
        vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=5000, min_df=5)
        X_vect = vectorizer.fit_transform(df['text'])
    else:
        X_vect = vectorizer.transform(df['text'])

    # Scale the two length features; same fit-or-reuse rule as above.
    length_features = df[['text_length', 'word_count']]
    if scaler is None:
        scaler = MaxAbsScaler()
        scaled_features = scaler.fit_transform(length_features)
    else:
        scaled_features = scaler.transform(length_features)

    # The sentiment score is appended unscaled.
    all_features = np.hstack((scaled_features, df[['sentiment_score']].to_numpy()))
    all_features_sparse = csr_matrix(all_features)

    # Combine the additional features with the vectorized text (all sparse).
    X_combined = sparse_hstack((X_vect, all_features_sparse))

    return X_combined, vectorizer, scaler


In [48]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# `data` is expected to provide a 'text' column (model input) and a 'label'
# column (target). Building the features fits a vectorizer and a scaler on
# the training texts.
X_train, vectorizer, scaler = create_features(data)
y_train = data['label']

# Persist the fitted preprocessing objects so they can be reloaded later.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Baseline random forest, fitted directly on the sparse training matrix
# (`fit` returns the estimator itself).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the trained model; the returned file list is the cell output.
joblib.dump(clf, 'sentiment_classifier.pkl')


['sentiment_classifier.pkl']

In [55]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: 3 * 3 * 3 * 2 = 54 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    class_weight=[None, 'balanced'],
)

# Exhaustive search, scored by accuracy with 3-fold cross-validation and
# run on all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Keep the best estimator found by the search and persist it.
best_clf = grid_search.best_estimator_
joblib.dump(best_clf, 'sentiment_classifier_best.pkl')

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters: {'class_weight': 'balanced', 'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 100}


['sentiment_classifier_best.pkl']

In [59]:
from sklearn.metrics import classification_report, accuracy_score
# `hstack` must be scipy's: it stacks sparse matrices. (A preceding
# `from numpy import hstack` was shadowed by this import and has been dropped.)
from scipy.sparse import hstack, csr_matrix

# Load the validation data
valid_data = pd.read_csv('validation.csv')

# Reload the persisted artifacts. joblib.load unpickles its input, so only
# load files written by this notebook.
# NOTE: this evaluates the baseline model ('sentiment_classifier.pkl'), not
# the grid-searched one saved as 'sentiment_classifier_best.pkl'.
clf = joblib.load('sentiment_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')
scaler = joblib.load('scaler.pkl')

# Vectorize the validation text with the already-fitted vectorizer
# (transform only, so the columns match the training matrix)
X_valid_vect = vectorizer.transform(valid_data['text'])

# Hand-crafted features, computed the same way as for the training data
valid_data['text_length'] = valid_data['text'].apply(len)
valid_data['word_count'] = valid_data['text'].apply(lambda x: len(x.split()))
valid_data['sentiment_score'] = valid_data['text'].apply(parsing_words)

# Scale the length features with the fitted scaler, append the unscaled
# sentiment score, and convert to a sparse format
valid_scaled_features = scaler.transform(valid_data[['text_length', 'word_count']])
valid_all_features = np.hstack((valid_scaled_features, valid_data[['sentiment_score']].to_numpy()))
valid_features_sparse = csr_matrix(valid_all_features)

# Combine the additional features with the vectorized text (all in sparse format)
X_valid_combined = hstack([X_valid_vect, valid_features_sparse])

# Predict using the classifier
y_valid_pred = clf.predict(X_valid_combined)

# True labels of the validation set
y_valid = valid_data['label']

# Output the accuracy and the per-class report for the validation set
print("Validation Accuracy:", accuracy_score(y_valid, y_valid_pred))
print(classification_report(y_valid, y_valid_pred))


Test Accuracy: 0.864
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       550
           1       0.88      0.91      0.90       704
           2       0.91      0.72      0.81       178
           3       0.89      0.81      0.85       275
           4       0.83      0.83      0.83       212
           5       0.87      0.77      0.82        81

    accuracy                           0.86      2000
   macro avg       0.87      0.82      0.84      2000
weighted avg       0.87      0.86      0.86      2000



In [58]:
from sklearn.metrics import classification_report, accuracy_score
# `hstack` must be scipy's: it stacks sparse matrices. (A preceding
# `from numpy import hstack` was shadowed by this import and has been dropped.)
from scipy.sparse import hstack, csr_matrix

# Load the test data
test_data = pd.read_csv('test.csv')

# Reload the persisted artifacts. joblib.load unpickles its input, so only
# load files written by this notebook.
clf = joblib.load('sentiment_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')
scaler = joblib.load('scaler.pkl')

# Vectorize the test text with the already-fitted vectorizer
# (transform only, so the columns match the training matrix)
X_test_vect = vectorizer.transform(test_data['text'])

# Hand-crafted features, computed the same way as for the training data
test_data['text_length'] = test_data['text'].apply(len)
test_data['word_count'] = test_data['text'].apply(lambda x: len(x.split()))
test_data['sentiment_score'] = test_data['text'].apply(parsing_words)

# Scale the length features with the fitted scaler and append the unscaled
# sentiment score
scaled_features = scaler.transform(test_data[['text_length', 'word_count']])
all_features = np.hstack((scaled_features, test_data[['sentiment_score']].to_numpy()))

features_sparse = csr_matrix(all_features)  # Convert to sparse matrix

# Combine the additional features with the vectorized text (all in sparse format)
X_test_combined = hstack([X_test_vect, features_sparse])

# Predict using the classifier
y_pred = clf.predict(X_test_combined)

# True labels of the test set
y_test = test_data['label']

# Output the accuracy and the per-class report for the test set
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8605
              precision    recall  f1-score   support

           0       0.88      0.91      0.90       581
           1       0.86      0.90      0.88       695
           2       0.80      0.64      0.71       159
           3       0.89      0.85      0.87       275
           4       0.84      0.84      0.84       224
           5       0.71      0.64      0.67        66

    accuracy                           0.86      2000
   macro avg       0.83      0.80      0.81      2000
weighted avg       0.86      0.86      0.86      2000



In [43]:
# Store the predictions on the test frame, then preview the first 25 rows
# with the text, the true label and the predicted label side by side.
test_data['predicted_label'] = y_pred
test_data[['text', 'label', 'predicted_label']].head(25)

Unnamed: 0,text,label,predicted_label
0,im feeling rather rotten so im not very ambiti...,0,0
1,im updating my blog because i feel shitty,0,0
2,i never make her separate from me because i do...,0,0
3,i left with my bouquet of red and yellow tulip...,1,1
4,i was feeling a little vain when i did this one,0,0
...,...,...,...
95,im feeling angry at someone i do something tho...,3,3
96,i love neglecting this blog but sometimes i fe...,2,1
97,i lay in bed feeling as though i were awaiting...,0,1
98,i feel my heart is tortured by what i have done,3,4
