In [42]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [11]:
df = pd.read_csv('data/train.csv')

In [12]:
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [43]:
# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing to the tweet column
df['cleaned_text'] = df['SentimentText'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jmattson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jmattson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentText,cleaned_text
0,1,0,is so sad for my APL frie...,sad apl friend
1,2,0,I missed the New Moon trail...,miss new moon trailer
2,3,1,omg its already 7:30 :O,omg alreadi
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cri ive dentist sinc s...
4,5,0,i think mi bf is cheating on me!!! ...,think mi bf cheat tt


In [25]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed

# Fit and transform the cleaned text
X = tfidf.fit_transform(df['cleaned_text'])
y = df['Sentiment']

# Display the shape of the feature matrix
print(X.shape)

(99989, 5000)


In [26]:
print(X)

  (0, 1700)	0.7232827010348476
  (0, 3709)	0.6905520504521983
  (1, 4486)	0.5991199483661498
  (1, 2816)	0.5732922558760698
  (1, 2915)	0.36067946812754204
  (1, 2780)	0.42696791225278546
  (2, 135)	0.6922262518507328
  (2, 3037)	0.7216805500002658
  (3, 2768)	0.30888050632533515
  (3, 3417)	0.2621695137500573
  (3, 1030)	0.4222846429977529
  (3, 1778)	0.1665514121950295
  (3, 3914)	0.26239551106556597
  (3, 1151)	0.3767515016072556
  (3, 2284)	0.21881271819685766
  (3, 1036)	0.2764076135148109
  (3, 1907)	0.36105584063838325
  (3, 4020)	0.29479133346442504
  (3, 2182)	0.2783159414536099
  (4, 4526)	0.4834703627443245
  (4, 761)	0.5241741686564342
  (4, 439)	0.4568026808859681
  (4, 2747)	0.47074441215050944
  (4, 4378)	0.24744465729869974
  (5, 2845)	0.5812207265603395
  :	:
  (99983, 3530)	0.2457139360888111
  (99983, 1778)	0.20797604860230548
  (99984, 10)	0.41315970113428263
  (99984, 3792)	0.4159078929477707
  (99984, 3362)	0.40453797019756643
  (99984, 2093)	0.29101622277702094
 

In [45]:

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7510
Precision: 0.7507
Recall: 0.7510
F1-Score: 0.7483

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.65      0.70      8750
           1       0.75      0.83      0.79     11248

    accuracy                           0.75     19998
   macro avg       0.75      0.74      0.74     19998
weighted avg       0.75      0.75      0.75     19998



In [46]:
# Initialize Naive Bayes classifier
nb_clf = MultinomialNB()

# Train the classifier
nb_clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_nb = nb_clf.predict(X_test)
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}")
print(classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.7411
              precision    recall  f1-score   support

           0       0.75      0.61      0.67      8750
           1       0.73      0.84      0.79     11248

    accuracy                           0.74     19998
   macro avg       0.74      0.73      0.73     19998
weighted avg       0.74      0.74      0.74     19998



In [47]:
# Initialize SVM classifier
svm_clf = SVC()

# Train the classifier
svm_clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_svm = svm_clf.predict(X_test)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.7535
              precision    recall  f1-score   support

           0       0.76      0.64      0.70      8750
           1       0.75      0.84      0.79     11248

    accuracy                           0.75     19998
   macro avg       0.75      0.74      0.74     19998
weighted avg       0.75      0.75      0.75     19998



In [38]:
# Initialize Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100)

# Train the classifier
rf_clf.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_clf.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.7350
              precision    recall  f1-score   support

           0       0.71      0.66      0.69      8750
           1       0.75      0.79      0.77     11248

    accuracy                           0.74     19998
   macro avg       0.73      0.73      0.73     19998
weighted avg       0.73      0.74      0.73     19998



In [61]:
# Example of a new text field
new_text = "I love this new product."

# Preprocess the new text
cleaned_new_text = preprocess_text(new_text)

# Vectorize the new text
new_text_vectorized = tfidf.transform([cleaned_new_text])

# Predict the sentiment
predicted_sentiment = clf.predict(new_text_vectorized)

# Print the predicted sentiment
print(f"Predicted Sentiment: {predicted_sentiment[0]}")

Predicted Sentiment: 1
