In [None]:
# Part 2
import pandas as pd
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from sklearn.decomposition import PCA

# Download necessary NLTK resources.
# 'punkt_tab' is the sentence-tokenizer data that sent_tokenize loads on recent
# NLTK releases (it replaced the pickled 'punkt' models); 'punkt' is still
# fetched so the notebook keeps working on older installs.
NLTK_RESOURCES = ['stopwords', 'wordnet', 'omw-1.4', 'vader_lexicon', 'punkt', 'punkt_tab']
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the CSV file
df = pd.read_csv("rick_n_morty.csv")

# Preprocessing: build one row per episode holding all of its dialogue and its IMDb score.
episode_keys = ['Season No', 'episode no.']

# Empty dialogue cells are read as NaN (a float); ' '.join would raise TypeError
# on them, so they are dropped inside each group before joining.
episode_dialogue = (
    df.groupby(episode_keys)['dialouge']
    .apply(lambda lines: ' '.join(lines.dropna().astype(str)))
    .reset_index()
)

# Exactly one score per episode: first() returns the first non-null value of
# each group, so an episode whose rows disagree (or are partly missing) can no
# longer produce several rows here and duplicate the episode in the merge.
episode_scores = df.groupby(episode_keys, as_index=False)['IMDb score '].first()

# validate= makes pandas raise if either side ever stops being unique per episode.
episode_data = pd.merge(
    episode_dialogue,
    episode_scores,
    on=episode_keys,
    how='left',
    validate='one_to_one',
)


In [None]:
# Normalize Text
# Shared resources, created once here and looked up by normalize_text on every call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def normalize_text(text):
    """Return a lower-cased, stop-word-free, lemmatized version of ``text``.

    The text is lower-cased and split into runs of word characters; tokens
    found in ``stop_words`` are discarded and the remaining ones are passed
    through ``lemmatizer.lemmatize``. The result is the surviving tokens
    joined by single spaces.
    """
    kept_tokens = []
    for token in re.findall(r'\w+', text.lower()):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Store the cleaned text next to the raw dialogue; the raw column is kept as-is.
episode_data['normalized_dialogue'] = episode_data['dialouge'].map(normalize_text)


In [None]:
# Calculate word count
# Counted on the normalized text, i.e. the number of whitespace-separated
# tokens left after stop-word removal.
episode_data['word_count'] = [
    len(text.split()) for text in episode_data['normalized_dialogue']
]


In [None]:
# TF-IDF Vectorization
# NOTE(review): the vectorizer is fitted on every episode before the train/test
# split performed later in this notebook, so the vocabulary and IDF weights are
# informed by the held-out episodes.
vectorizer_tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
tfidf_matrix = vectorizer_tfidf.fit_transform(episode_data['normalized_dialogue'])
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
    # concat(axis=1) aligns rows by index label, so reuse episode_data's labels
    # instead of relying on both frames happening to carry 0..n-1.
    index=episode_data.index,
)
# Remove TF-IDF columns left by an earlier execution of this cell so that
# re-running it replaces them rather than appending duplicate column names.
episode_data = pd.concat(
    [episode_data.drop(columns=tfidf_df.columns, errors='ignore'), tfidf_df],
    axis=1,
)



In [None]:
# Word2Vec Embeddings
# Each episode's normalized dialogue is fed to Word2Vec as one token list.
sentences = [doc.split() for doc in episode_data['normalized_dialogue']]
# A fixed seed plus a single worker thread is what gensim requires for
# repeatable vectors; with several workers, thread scheduling reorders the
# training updates and every run yields different embeddings (and therefore
# different PCA features and model scores).
# NOTE(review): across separate interpreter launches gensim additionally
# expects PYTHONHASHSEED to be set for full reproducibility.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=1,
    seed=42,
)

def get_episode_embedding(episode_text, model=None):
    """Average the word vectors of every in-vocabulary token of ``episode_text``.

    Parameters
    ----------
    episode_text : str
        Whitespace-separated tokens.
    model : optional
        Object exposing ``.wv`` (supporting ``in``, ``[]`` and ``vector_size``).
        Defaults to the notebook-level ``word2vec_model``.

    Returns
    -------
    numpy.ndarray
        Mean vector of the known tokens, or an all-zero vector of the model's
        dimensionality when no token is in the vocabulary.
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in episode_text.split() if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model rather than a literal so it stays
    # stackable with real embeddings if vector_size is ever changed.
    return np.zeros(keyed_vectors.vector_size)

# One row per episode, one column per embedding dimension.
embeddings = np.vstack(episode_data['normalized_dialogue'].apply(get_episode_embedding).tolist())
# Take the column count from the stacked array instead of repeating the
# literal 100, so changing vector_size does not cause a shape mismatch here.
embedding_df = pd.DataFrame(
    embeddings,
    columns=[f'embedding_{i}' for i in range(embeddings.shape[1])],
)



In [None]:
# PCA for Word2Vec Embeddings
# The component count is capped by the smaller of (episodes, embedding dims).
# NOTE(review): PCA centres the data before decomposing it, so when there are
# fewer episodes than embedding dimensions the final component should carry
# essentially no variance - consider capping at n_samples - 1. Also note that
# PCA is fitted on all episodes, before the train/test split.
n_components = min(embedding_df.shape[0], embedding_df.shape[1])
pca = PCA(n_components=n_components)
pca_embeddings = pca.fit_transform(embedding_df)
pca_embedding_df = pd.DataFrame(
    pca_embeddings,
    columns=[f'pca_embedding_{i}' for i in range(n_components)],
    # concat(axis=1) aligns rows by index label; reuse episode_data's labels
    # (embedding rows were produced in episode_data order).
    index=episode_data.index,
)
# Drop PCA columns from a previous execution so re-running this cell replaces
# them instead of appending duplicate column names.
episode_data = pd.concat(
    [episode_data.drop(columns=pca_embedding_df.columns, errors='ignore'), pca_embedding_df],
    axis=1,
)


In [None]:
# Sentiment Analysis
analyzer = SentimentIntensityAnalyzer()


def sentence_sentiment_variance(text):
    """Variance of VADER compound scores across the sentences of ``text``.

    Returns 0.0 when ``text`` yields no sentences: np.var of an empty list is
    NaN (with a RuntimeWarning), and a NaN feature makes
    GradientBoostingRegressor.fit reject the whole matrix.
    """
    sentence_scores = [analyzer.polarity_scores(s)['compound'] for s in sent_tokenize(text)]
    if not sentence_scores:
        return 0.0
    return np.var(sentence_scores)


# Computed on the raw dialogue, not the normalized text.
# NOTE(review): the compound score is computed over the whole episode text at
# once; for very long inputs VADER's normalisation presumably pushes it toward
# +/-1 - verify that this feature actually varies between episodes.
episode_data['sentiment_compound'] = episode_data['dialouge'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
episode_data['sentiment_variance'] = episode_data['dialouge'].apply(sentence_sentiment_variance)


In [None]:
# Prepare features and target
# Everything except the episode identifiers, the raw/normalized text and the
# score itself is used as a model input; the IMDb score is the target.
non_feature_columns = ['Season No', 'episode no.', 'dialouge', 'IMDb score ', 'normalized_dialogue']
target = episode_data['IMDb score ']
features = episode_data.drop(columns=non_feature_columns)


In [None]:
# Split data into training and testing sets
# 20% of the episodes are held out; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Sanity check: show the dtype of every training feature column.
train_dtypes = X_train.dtypes
print(train_dtypes)

word_count              int64
10                    float64
137                   float64
20                    float64
22                    float64
                       ...   
pca_embedding_36      float32
pca_embedding_37      float32
pca_embedding_38      float32
sentiment_compound    float64
sentiment_variance    float64
Length: 1042, dtype: object


In [None]:
# Gradient Boosting Regression
# fit() returns the estimator itself, so construction and training can be chained.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

In [None]:
# Evaluate model
def evaluate_model(predictions, actual):
    """Score ``predictions`` against ``actual``.

    Returns a ``(rmse, r2, mae)`` tuple: the square root of the mean squared
    error, the R^2 score and the mean absolute error, in that order.
    """
    mse = mean_squared_error(actual, predictions)
    return (
        np.sqrt(mse),
        r2_score(actual, predictions),
        mean_absolute_error(actual, predictions),
    )

# Score the held-out predictions and report all three metrics on one line.
gb_metrics = evaluate_model(gb_predictions, y_test)
gb_rmse, gb_r2, gb_mae = gb_metrics
print("Gradient Boosting: RMSE={}, R2={}, MAE={}".format(*gb_metrics))


Gradient Boosting: RMSE=1.0693835641941645, R2=-0.05459938431687772, MAE=0.9623657388185116


In [None]:
# Feature Importance
# Pair each training column with the importance the fitted model assigned to
# it, ordered from most to least important, and show the top 20.
feature_names = X_train.columns
feature_importance = gb_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
print("\nFeature Importance:", importance_df.head(20), sep="\n")



Feature Importance:
               Feature  Importance
206               dumb    0.407257
890               told    0.085949
32              answer    0.082127
16               ahead    0.076455
970                win    0.050425
781               sigh    0.022493
918                 uh    0.022348
25             alright    0.016301
974              woman    0.016074
1005   pca_embedding_4    0.014035
961           whatever    0.012492
1036  pca_embedding_35    0.012450
1025  pca_embedding_24    0.010495
158              crazy    0.009721
238                eye    0.009483
164                cry    0.008450
676               pull    0.008231
624            outside    0.008022
989               yeah    0.007603
224               ever    0.006864


In [None]:
# Correlation Matrix
# Pairwise correlation between every pair of feature columns (target excluded).
correlation_matrix = features.corr()
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")


Correlation Matrix:
                    word_count        10       137        20        22  \
word_count            1.000000  0.107349  0.010049  0.091375  0.066154   
10                    0.107349  1.000000 -0.027661  0.229401  0.053532   
137                   0.010049 -0.027661  1.000000 -0.029378 -0.051897   
20                    0.091375  0.229401 -0.029378  1.000000 -0.015271   
22                    0.066154  0.053532 -0.051897 -0.015271  1.000000   
...                        ...       ...       ...       ...       ...   
pca_embedding_36     -0.419449  0.086669  0.057044  0.171725 -0.194125   
pca_embedding_37     -0.086487 -0.109433  0.037969  0.376266 -0.152985   
pca_embedding_38      0.027456  0.182810 -0.059788 -0.083147  0.308151   
sentiment_compound    0.158783 -0.077784  0.171079  0.016670  0.112056   
sentiment_variance    0.128563 -0.084297  0.133971 -0.028406  0.156967   

                         aah      able      acid    across  actually  ...  \
word_count   