In [1]:
import pandas as pd
import sys
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from wordcloud import WordCloud
from xgboost import XGBClassifier
from nltk.stem import SnowballStemmer

# Fetch the NLTK resources at import time so the cells below can run on a fresh machine.
# 'vader_lexicon' backs the SentimentIntensityAnalyzer imported above.
# NOTE(review): nothing in the visible cells reads the 'stopwords' corpus (the
# TfidfVectorizer uses scikit-learn's stop_words='english') — confirm it is still needed.
nltk.downloader.download('stopwords')
nltk.downloader.download('vader_lexicon')

def data_import(path):
    """
    Import data from a CSV file.

    Parameters
    ----------
    path : str or path-like
        Location of a UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file, unmodified.
    """
    # The previous implementation called ``data.sample(3)`` and threw the result
    # away. Inside a function that displays nothing, and it raised ValueError for
    # files with fewer than three rows, so it has been removed. Call
    # ``.sample()``/``.head()`` on the returned frame in a cell to preview it.
    return pd.read_csv(path, encoding='utf-8')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:

def preprocess_data(data):
    """
    Preprocess data by converting to lowercase, removing non-alphabetic characters, and applying stemming.

    Steps, in order:
      1. lowercase the 'Lyric' column,
      2. delete every character that is not an ASCII letter or whitespace,
      3. strip leading/trailing whitespace,
      4. drop rows containing any missing value,
      5. drop rows whose 'Title' duplicates an earlier row,
      6. stem each whitespace-separated token with the English Snowball stemmer.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Lyric' and 'Title' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. The frame is modified IN PLACE
        (callers in this notebook keep using the original variable afterwards),
        so pass a copy if the raw data must be preserved.
    """
    lyrics = data['Lyric'].str.lower()
    # regex=True is required: since pandas 2.0 ``Series.str.replace`` treats the
    # pattern as a literal string by default, which silently left every special
    # character in place. A raw string also avoids the invalid '\s' escape warning.
    lyrics = lyrics.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    data['Lyric'] = lyrics.str.strip()

    data.dropna(inplace=True)  # remove rows with any missing value
    data.drop_duplicates(subset=['Title'], inplace=True)  # keep first row per title

    stemmer = SnowballStemmer('english')
    data['Lyric'] = data['Lyric'].apply(
        lambda x: ' '.join(stemmer.stem(word) for word in str(x).split()) if pd.notnull(x) else ''
    )
    return data


In [3]:

def feature_engineering(data, n_topics=200, num_top_words=200, max_features=1000, random_state=42):
    """
    Fit TF-IDF + LDA on the lyrics and label every row with its dominant topic.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Lyric' column of strings. A 'topics' column is added
        IN PLACE holding the 1-based index of the highest-weighted LDA topic.
    n_topics : int, default 200
        Number of LDA components.
    num_top_words : int, default 200
        How many of the highest-weighted vocabulary terms to list per topic.
    max_features : int, default 1000
        Vocabulary size cap passed to the TF-IDF vectorizer.
    random_state : int, default 42
        Seed for the LDA fit.

    Returns
    -------
    topics : list of str
        One "Topic #k: word word ..." description per component.
    topic_counts : pandas.Series
        Number of rows assigned to each topic label, sorted by label.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=0.95, min_df=2, max_features=max_features, stop_words='english'
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['Lyric'])

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(tfidf_matrix)

    feature_names = tfidf_vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_words_idx = topic.argsort()[::-1][:num_top_words]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(f"Topic #{topic_idx + 1}: {' '.join(top_words)}")

    lda_topic_matrix = lda.transform(tfidf_matrix)
    # +1 so the stored labels line up with the "Topic #k" numbering above.
    data['topics'] = lda_topic_matrix.argmax(axis=1) + 1

    topic_counts = data['topics'].value_counts().sort_index()
    return topics, topic_counts


In [4]:
def sentiment_analysis(data):
    """
    Score every lyric with VADER and derive a mood label from the compound score.

    Three columns are written to ``data`` in place:
      * 'sentiment_score'    - the full polarity dictionary for the lyric,
      * 'sentiment_compound' - the 'compound' entry of that dictionary,
      * 'Mood'               - 1 if compound >= 0.2, 2 if compound <= -0.2, else 0.

    Returns the same DataFrame that was passed in.
    """
    vader = SentimentIntensityAnalyzer()

    def mood_from_compound(score):
        # Guard clauses instead of an if/elif/else ladder; thresholds are +/-0.2.
        if score >= 0.2:
            return 1
        if score <= -0.2:
            return 2
        return 0

    data['sentiment_score'] = data['Lyric'].apply(lambda lyric: vader.polarity_scores(lyric))
    data['sentiment_compound'] = data['sentiment_score'].apply(lambda scores: scores['compound'])
    data['Mood'] = data['sentiment_compound'].apply(mood_from_compound)
    return data

In [5]:
def train_model(train_data):
    """
    Fit an XGBoost classifier that predicts 'Mood' from the topic label and
    the compound sentiment score.

    ``train_data`` must already contain the 'topics', 'sentiment_compound'
    and 'Mood' columns. Returns the fitted classifier.
    """
    feature_columns = ['topics', 'sentiment_compound']
    classifier = XGBClassifier(n_estimators=100, random_state=42)
    classifier.fit(train_data[feature_columns], train_data['Mood'])
    return classifier

In [6]:
def make_predictions(model, test_data):
    """
    Run ``model.predict`` on the 'topics' and 'sentiment_compound' columns.

    The predictions are stored on ``test_data`` in place as 'Predicted_Mood'.
    Returns a ``(model, predictions)`` tuple; the model is handed back unchanged.
    """
    feature_columns = ['topics', 'sentiment_compound']
    predictions = model.predict(test_data[feature_columns])
    test_data['Predicted_Mood'] = predictions
    return model, predictions

In [7]:
def create_word_clouds(test_analysed_data, test_topics):
    """
    Draw one word cloud per topic from the lyrics assigned to that topic.

    Parameters
    ----------
    test_analysed_data : pandas.DataFrame
        Must contain 'topics' (1-based topic label per row) and 'Lyric'.
    test_topics : sequence
        One entry per topic; only its length is used to enumerate the labels.

    Topics with no lyrics are skipped. Rendering is best-effort: a failure on
    one topic is reported and the loop moves on to the next topic.
    """
    for topic_idx, _ in enumerate(test_topics):
        # Row labels in 'topics' start at 1, so use the same number for
        # filtering and for the title (the title used to show the 0-based index).
        topic_number = topic_idx + 1
        try:
            topic_lyrics = test_analysed_data[test_analysed_data['topics'] == topic_number]['Lyric']
            text = ' '.join(topic_lyrics)
            if not text.strip():
                # No lyric was assigned to this topic; nothing to draw.
                continue
            wordcloud = WordCloud(width=700, height=300).generate(text)
            plt.figure(figsize=(10, 5))
            plt.title(f"Word Cloud for {topic_number}")
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.show()
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still stop the loop, and report the
            # problem instead of hiding it.
            print(f"Skipping word cloud for topic {topic_number}: {exc}")
            continue

In [8]:
def plot_histogram(data, column_name):
    """
    Show a bar chart of how often each distinct value of ``column_name`` occurs.

    Bars are ordered by value (not by frequency).
    """
    plt.figure(figsize=(7, 3))
    value_counts = data[column_name].value_counts().sort_index()
    axes = value_counts.plot(kind='bar', color='skyblue')
    axes.set_title(f'Histogram for {column_name}')
    axes.set_xlabel(column_name)
    axes.set_ylabel('Count')
    plt.show()

In [9]:
# Load the training CSV and run it through the text-cleaning step.
path = './datasets/Training_Data.csv'
data = data_import(path)
preprocessed_data = preprocess_data(data)

# Train the model
# NOTE(review): train_model reads the 'topics', 'sentiment_compound' and 'Mood'
# columns, none of which are computed in this cell — presumably they are already
# stored in Training_Data.csv; confirm against the file.
trained_model = train_model(preprocessed_data)
print("Model Trained")


Model Trained


In [None]:
# --- Load, clean and featurise the user's songs -----------------------------
test_path = './datasets/mix.csv'
test_data = data_import(test_path)
print("Data Imported")
# preprocess_data, feature_engineering and sentiment_analysis all modify the
# frame in place and return it, so the names below refer to the same object.
test_processed_data = preprocess_data(test_data)
print("Data Pre-Processed")
test_topics, test_topic_counts = feature_engineering(test_processed_data)
print(test_topics)
print("Extracted Topics")
test_analysed_data = sentiment_analysis(test_processed_data)
print("Data Analyzed")

# --- Predict and evaluate ---------------------------------------------------
model, y_pred = make_predictions(trained_model, test_analysed_data)
y_true = test_analysed_data['Mood']

accuracy = accuracy_score(y_true, y_pred)
confusion = confusion_matrix(y_true, y_pred)
report = classification_report(y_true, y_pred)

print("User input analyzed")
# Series.mode() returns every value tied for most frequent. Comparing that
# whole array with ``==`` inside ``if`` raised "truth value of an array is
# ambiguous" whenever two moods were tied, so reduce it to a single scalar.
# Modes come back sorted, so a tie resolves to the smallest label.
mood_modes = test_analysed_data['Mood'].mode()
mood = int(mood_modes.iloc[0]) if not mood_modes.empty else 0
if mood == 1:
    Mood_alpha = "Happy"
elif mood == 2:
    Mood_alpha = "Sad"
else:
    Mood_alpha = "Calm"
print("The predicted mood of the user is",Mood_alpha)

# --- Interactive menu; any choice other than 1/2/3 exits --------------------
while True:
    print("\nPress 1 to get evaluation metrics")
    print("Press 2 to get word clouds for the songs")
    print("Press 3 to get a histogram for the data")
    choice = input("Enter Choice: ")
    if choice == "1":
        print("Accuracy\n", accuracy)
        print("Confusion Matrix\n", confusion)
        print("Classification Report\n", report)
    elif choice == "2":
        create_word_clouds(test_analysed_data, test_topics)
    elif choice == "3":
        plot_histogram(test_analysed_data, 'Predicted_Mood')
    else:
        print("Thank You for Using TuneTherapy")
        break

Data Imported
Data Pre-Processed
['Topic #1: yee ha gone gonna good got gotta grand graviti grow guess guy habit exit hand happen hard hate head hear heard heart hell help gon goin girl gimm face fall far feel feelin fell fight final fine fli flower forget forgiv freak free friend fuck fuckin fun game gave hey hi hide kind kiss knew know la late laugh lead learn leav left lesson let lie life light like line lip listen littl live kinda just high jump higher hit hmm hold hole home honey hope hour hurt ice idea igg iggi iii imagin import insid isn jami jet eye everyth london break better big biggi bit bitch blow bodi bore bout boy breath everyon breathless bring broken burn came care caus chang chest chill bet best believ befor ah ahead ain anymor anywh apart ariana arichan ask away ayi azalea babe babi bad bae band bare beauti becom bed circl clear click design dice did didn die differ doe doin don dont door drive dumb earth edg eh el em en end energi everi diamond deni clock deeper clos