In [31]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import math

# Load the training set and the two opinion-word lexicons.
data = pd.read_csv('training.csv')

# The lexicon files have no header row: the previous code had to rename a column
# literally called '2-faced' (resp. 'a+'), i.e. the first lexicon entry was being
# consumed as the header and silently dropped from the word list. Reading with
# header=None keeps that first word as data, and names= labels the column directly.
# NOTE(review): assumes each lexicon file holds a single column of words - confirm.
negativewords = pd.read_csv('negative-words.csv', header=None, names=['neg'])
negativewords['val'] = -1  # polarity tag, broadcast to every row

positivewords = pd.read_csv('positive-words.csv', header=None, names=['pos'])
positivewords['val'] = 1  # polarity tag, broadcast to every row

In [32]:
import re

# Tokeniser shared by every call: maximal runs of word characters.
_TOKEN_RE = re.compile(r'\b\w+\b')


def _lexicon_sets():
    """Return ``(negative, positive)`` lower-cased word sets built from the lexicon frames.

    Building the sets walks both lexicons, so the result is cached and only rebuilt
    when ``negativewords`` or ``positivewords`` is rebound to a different object
    (for example after re-running the loading cell). In-place edits to the same
    frame objects are not detected.
    """
    cache = _lexicon_sets.cache
    if cache is None or cache[0] is not negativewords or cache[1] is not positivewords:
        cache = (
            negativewords,
            positivewords,
            frozenset(negativewords['neg'].str.lower()),
            frozenset(positivewords['pos'].str.lower()),
        )
        _lexicon_sets.cache = cache
    return cache[2], cache[3]


# Holds (negative frame, positive frame, negative set, positive set) once built.
_lexicon_sets.cache = None


def parsing_words(s, negative_words=None, positive_words=None):
    """Score the polarity of a text as the mean of its lexicon hits.

    The text is lower-cased and split into word tokens. Every token found in the
    negative lexicon counts as -1 and every token found in the positive lexicon
    counts as +1; a token present in both lexicons contributes one hit of each.

    Parameters
    ----------
    s : str
        Text to score. Anything that is not a string (e.g. a missing value)
        is treated as text without lexicon hits.
    negative_words, positive_words : container of str, optional
        Lower-cased lexicons to use instead of the module-level ``negativewords``
        and ``positivewords`` frames. Each is resolved independently.

    Returns
    -------
    float
        ``(positive hits - negative hits) / (total hits)``, in [-1, 1], or 0.0
        when no token matches either lexicon.
    """
    if not isinstance(s, str):
        return 0.0

    # Only touch the module-level frames when the caller did not supply a lexicon.
    if negative_words is None or positive_words is None:
        default_negative, default_positive = _lexicon_sets()
        if negative_words is None:
            negative_words = default_negative
        if positive_words is None:
            positive_words = default_positive

    negative_hits = 0
    positive_hits = 0
    for word in _TOKEN_RE.findall(s.lower()):
        # Two independent checks (not elif): a word listed in both lexicons counts twice.
        if word in negative_words:
            negative_hits += 1
        if word in positive_words:
            positive_hits += 1

    total_hits = negative_hits + positive_hits
    if total_hits == 0:
        return 0.0
    return (positive_hits - negative_hits) / total_hits

# Emotion classes encoded by the integer values of the `label` column:
# 0 = sadness
# 1 = joy
# 2 = love
# 3 = anger
# 4 = fear
# 5 = surprise

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler

# Unigram bag-of-words vectorizer shared by the notebook. The vocabulary is capped
# (max_features) and rare terms are dropped (min_df) to avoid memory issues.
vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=5000, min_df=5)


def create_features(df, fitted_vectorizer=None, fitted_scaler=None):
    """Build the sparse model matrix: word counts plus three scaled numeric features.

    By default the module-level ``vectorizer`` and a fresh ``MaxAbsScaler`` are
    *fitted* on ``df``, which is only appropriate for training data. To featurise
    validation or test data without relearning the vocabulary or the scaling,
    pass the objects returned by the training call as ``fitted_vectorizer`` and
    ``fitted_scaler``; they are then applied with ``transform`` only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'text'`` column of strings. The columns ``text_length``,
        ``word_count`` and ``sentiment_score`` are added to ``df`` in place.
    fitted_vectorizer : optional
        Already-fitted text vectorizer to reuse. When omitted, the module-level
        ``vectorizer`` is refitted on ``df['text']``.
    fitted_scaler : optional
        Already-fitted scaler to reuse. When omitted, a new ``MaxAbsScaler`` is
        fitted on the three numeric columns.

    Returns
    -------
    tuple
        ``(X_combined, vectorizer, scaler)``: the horizontally stacked sparse
        matrix (word counts first, then the three scaled columns) and the
        vectorizer and scaler that produced it.
    """
    # Kept local, as in the original cell, so the function does not rely on a
    # notebook-level scipy import.
    from scipy.sparse import hstack

    # Hand-crafted features: character count, whitespace-token count, lexicon polarity.
    df['text_length'] = df['text'].apply(len)
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    df['sentiment_score'] = df['text'].apply(parsing_words)
    numeric = df[['text_length', 'word_count', 'sentiment_score']]

    # Word counts: fit only when no fitted vectorizer was supplied.
    if fitted_vectorizer is None:
        text_vectorizer = vectorizer
        X_vect = text_vectorizer.fit_transform(df['text'])
    else:
        text_vectorizer = fitted_vectorizer
        X_vect = text_vectorizer.transform(df['text'])

    # Scale text_length, word_count and sentiment_score by their max absolute value.
    if fitted_scaler is None:
        scaler = MaxAbsScaler()
        additional_features = scaler.fit_transform(numeric)
    else:
        scaler = fitted_scaler
        additional_features = scaler.transform(numeric)

    # Append the scaled columns to the right of the word-count matrix.
    X_combined = hstack((X_vect, additional_features))

    return X_combined, text_vectorizer, scaler


In [35]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# Build the training matrix from 'data' (expects a 'text' column). The call also
# hands back the fitted vectorizer and scaler used to produce it.
X_train, vectorizer, scaler = create_features(data)

# Targets come from the 'label' column of the same frame.
y_train = data['label']

# Persist the fitted preprocessing objects so the evaluation cell can reload them.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Fit a 100-tree random forest (seeded for reproducibility) directly on the
# sparse matrix; fit() returns the estimator itself.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Save the trained model. As the cell's last expression, this also echoes the
# list of written file names.
joblib.dump(clf, 'sentiment_classifier.pkl')



['sentiment_classifier.pkl']

In [36]:
from numpy import hstack
from sklearn.metrics import classification_report, accuracy_score


from scipy.sparse import hstack, csr_matrix

# Held-out data; needs a 'text' column for the features and a 'label' column for scoring.
test_data = pd.read_csv('test.csv')

# Restore the classifier, vectorizer and scaler written by the training cell.
# NOTE: joblib.load unpickles its input - only load artifacts you produced yourself.
clf, vectorizer, scaler = (
    joblib.load(artifact_path)
    for artifact_path in ('sentiment_classifier.pkl', 'vectorizer.pkl', 'scaler.pkl')
)

# Recreate the three hand-crafted columns exactly as they were built for training.
test_data = test_data.assign(
    text_length=test_data['text'].apply(len),
    word_count=test_data['text'].apply(lambda x: len(x.split())),
    sentiment_score=test_data['text'].apply(parsing_words),
)

# Word counts from the already-fitted vectorizer (transform only, no refit).
X_test_vect = vectorizer.transform(test_data['text'])

# Apply the stored scaling, then wrap the dense result as a sparse matrix.
features = scaler.transform(test_data[['text_length', 'word_count', 'sentiment_score']])
features_sparse = csr_matrix(features)

# Same column layout as in training: word counts first, numeric features last.
X_test_combined = hstack([X_test_vect, features_sparse])

# Ground truth and model predictions.
y_test = test_data['label']
y_pred = clf.predict(X_test_combined)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8605
              precision    recall  f1-score   support

           0       0.88      0.91      0.90       581
           1       0.86      0.90      0.88       695
           2       0.80      0.64      0.71       159
           3       0.89      0.85      0.87       275
           4       0.84      0.84      0.84       224
           5       0.71      0.64      0.67        66

    accuracy                           0.86      2000
   macro avg       0.83      0.80      0.81      2000
weighted avg       0.86      0.86      0.86      2000



In [42]:
# Attach the predictions to the test frame, then preview the text next to its
# true and predicted label for the first 25 rows (the bare last expression is
# what the notebook renders).
test_data['predicted_label'] = y_pred
test_data.head(25)[['text', 'label', 'predicted_label']]

Unnamed: 0,text,label,predicted_label
0,im feeling rather rotten so im not very ambiti...,0,0
1,im updating my blog because i feel shitty,0,0
2,i never make her separate from me because i do...,0,0
3,i left with my bouquet of red and yellow tulip...,1,1
4,i was feeling a little vain when i did this one,0,0
5,i cant walk into a shop anywhere where i do no...,4,4
6,i felt anger when at the end of a telephone call,3,0
7,i explain why i clung to a relationship with a...,1,1
8,i like to have the same breathless feeling as ...,1,1
9,i jest i feel grumpy tired and pre menstrual w...,3,3
