# NLP Spotify Reviews Text Classification 

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [8]:
df = pd.read_csv('spotify_reviews.csv')
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,437314fe-1b1d-4352-abea-12fec30fce58,Rajib Das,It's good,4,0,,2024-05-09 16:28:13,
1,4933ad2c-c70a-4a84-957d-d405439b2e0f,Mihaela Claudia Neagu,"I love this app so much, I've been using Spoti...",5,0,8.9.38.494,2024-05-09 16:27:18,8.9.38.494
2,1ab275fb-59bf-42c7-88ef-b85901f0445e,JONATHAN GRACIA,Perfect,5,0,8.9.36.616,2024-05-09 16:27:03,8.9.36.616
3,b38406eb-7b11-4ceb-a45c-d7f28fb5d382,Cam Rempel,Best all around music streaming app I have use...,5,0,8.9.38.494,2024-05-09 16:26:19,8.9.38.494
4,7be7999d-4cb6-47b9-8414-d7bdaa9df578,Your clowness (Her Clowness),Are y'all fr gatekeeping the play button on so...,1,0,8.9.38.494,2024-05-09 16:26:14,8.9.38.494


In [11]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower().strip()
    return text

In [12]:
df['cleaned_content'] = df['content'].apply(clean_text)
df[['content', 'cleaned_content']].head()

Unnamed: 0,content,cleaned_content
0,It's good,its good
1,"I love this app so much, I've been using Spoti...",i love this app so much ive been using spotify...
2,Perfect,perfect
3,Best all around music streaming app I have use...,best all around music streaming app i have use...
4,Are y'all fr gatekeeping the play button on so...,are yall fr gatekeeping the play button on songs


In [17]:
X = df['cleaned_content']
y = df['score'].apply(lambda x: 'positive' if x > 3 else 'negative' if x < 3 else 'neutral')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [14]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.fit_transform(X_test)

In [15]:
model = LogisticRegression(max_iter=300)
model.fit(X_train_tfidf, y_train)

In [18]:
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.59      0.84      0.69      9733
     neutral       0.50      0.00      0.00      2471
    positive       0.34      0.22      0.27      4629

    accuracy                           0.55     16833
   macro avg       0.48      0.35      0.32     16833
weighted avg       0.51      0.55      0.47     16833



In [19]:
def predict_sentiment(review):
    cleaned_review = clean_text(review)
    vectorized_review = vectorizer.transform([cleaned_review])
    sentiment = model.predict(vectorized_review)[0]
    return sentiment

In [31]:
example_review = "The app is bad!!!!"
print(predict_sentiment(example_review))

negative
