In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load dataset
data = pd.read_csv('script_added.csv')
data = data.head(100)

In [14]:
data

Unnamed: 0,averageRating,numVotes,imdbid,title,budget,metascore,genres,year,script
0,5.7,57993,2076822,Chaos Walking,,-1,"Adventure, Sci-Fi",2021,S.w.a.t.\n\nby\nGeorge Huang\n\nBased on th: T...
1,7.3,99500,2934286,Halo,,-1,"Action, Sci-Fi",2021,\n\n\n \n\nPage not found | Bobcat Press\n\n\...
2,7.1,40996,10095582,Macbeth,,-1,"Drama, History, War",2021,FOR YOUR CONSIDERATION\n\nMACBETH\n\nBEST ADAP...
3,6.6,38757,9130508,Cherry,,-1,Drama,2021,\n\n\n\nFADE IN:\n\n\nFADE IN:\nBLACK AND WHIT...
4,7.3,613111,6723592,Tenet,"$205,000,000 (estimated)",69,"Action, Sci-Fi",2020,TENET\n\nWritten by\n\nChristopher Nolan\nORC...
...,...,...,...,...,...,...,...,...,...
95,6.9,261165,2704998,Game Night,"$37,000,000 (estimated)",66,"Action, Comedy, Crime, Thriller",2018,GAME NIGHT\nBy\nMark Perez\n\nRevisions By\nDa...
96,6.6,398458,2737304,Bird Box,,51,"Horror, Sci-Fi",2018,BIRD BOX\n\nScreenplay by\n\nEric Heisserer\n\...
97,6.8,371674,2798920,Annihilation,,79,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr...",2018,ANNIHILATION\n\nAlex Garland V.10\nOPEN ON -\...
98,6.7,50384,2837574,The Old Man & the Gun,,80,"Biography, Comedy, Crime, Drama, Romance",2018,FOR YOUR CONSIDERATION\n\nOb En\nra wt GUN\n\n...


In [4]:
#data = pd.read_csv('plot_added.csv')

In [5]:
# Show sample data
data.shape

(100, 9)

In [15]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def replace_text(input_string):
    text_to_replace = "Script provided for educational purposes. More scripts can be found here: http://www.sellingyourscreenplay.com/library"
    return input_string.replace(text_to_replace, "")

# Preprocess text: clean, remove stopwords, lemmatize
def clean_and_lemmatize(text):
    text = remove_html_js(text)
    text = replace_text(text)
    # Remove special characters and lowercase
    text = re.sub(r'[^A-Za-z\s]', '', text.lower())
    # Tokenize and remove stopwords, then lemmatize
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

def remove_special_sound_effects(text):
    # Define regex to match words with more than 3 repeated characters (e.g., "beeeez", "aaahhh")
    repeated_char_pattern = re.compile(r'(.)\1{2,}')  # Matches words with 3 or more repeating characters
    
    # Tokenize the text
    words = text.split()
    
    # Filter out words with repeated characters and sound effects
    filtered_words = [word for word in words if not repeated_char_pattern.search(word)]
    
    return ' '.join(filtered_words)

In [16]:
def remove_html_js(text):
    # Remove HTML tags using regex
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags (anything between < >)
    
    # Remove JavaScript code using regex (match common patterns like 'window.addEventListener' or '__wm.init')
    text = re.sub(r'window\.addEventListener\(.*?\);', '', text)  # Remove specific JS patterns
    text = re.sub(r'__wm\.[a-zA-Z0-9_]+\(.*?\);', '', text)  # Remove __wm JavaScript functions
    text = re.sub(r'javascript:.*?;', '', text)  # Remove javascript: links
    text = re.sub(r'var [a-zA-Z0-9_]+ = .*?;', '', text)  # Remove variable assignments in JS

    return text

In [17]:
example_script = "He screamed 'aaaaaaaahhh' and then said 'zzzzoooommmm' while walking to the zoo!"
cleaned_script = clean_and_lemmatize(example_script)
print(cleaned_script)

screamed aaaaaaaahhh said zzzzoooommmm walking zoo


In [18]:
# Apply preprocessing to the scripts
data['cleaned_script'] = data['script'].apply(clean_and_lemmatize)

In [19]:
# Apply preprocessing to the scripts
data['cleaned_script'] = data['cleaned_script'].apply(remove_special_sound_effects)

In [20]:
data['cleaned_script'].head()

0    swat george huang based th television series n...
1    page found bobcat press windowwpemojisettings ...
2    consideration macbeth best adapted screenplay ...
3    fade fade black whitedreamy cu serene face bea...
4    tenet written christopher nolan orchestra tuni...
Name: cleaned_script, dtype: object

In [21]:
data

Unnamed: 0,averageRating,numVotes,imdbid,title,budget,metascore,genres,year,script,cleaned_script
0,5.7,57993,2076822,Chaos Walking,,-1,"Adventure, Sci-Fi",2021,S.w.a.t.\n\nby\nGeorge Huang\n\nBased on th: T...,swat george huang based th television series n...
1,7.3,99500,2934286,Halo,,-1,"Action, Sci-Fi",2021,\n\n\n \n\nPage not found | Bobcat Press\n\n\...,page found bobcat press windowwpemojisettings ...
2,7.1,40996,10095582,Macbeth,,-1,"Drama, History, War",2021,FOR YOUR CONSIDERATION\n\nMACBETH\n\nBEST ADAP...,consideration macbeth best adapted screenplay ...
3,6.6,38757,9130508,Cherry,,-1,Drama,2021,\n\n\n\nFADE IN:\n\n\nFADE IN:\nBLACK AND WHIT...,fade fade black whitedreamy cu serene face bea...
4,7.3,613111,6723592,Tenet,"$205,000,000 (estimated)",69,"Action, Sci-Fi",2020,TENET\n\nWritten by\n\nChristopher Nolan\nORC...,tenet written christopher nolan orchestra tuni...
...,...,...,...,...,...,...,...,...,...,...
95,6.9,261165,2704998,Game Night,"$37,000,000 (estimated)",66,"Action, Comedy, Crime, Thriller",2018,GAME NIGHT\nBy\nMark Perez\n\nRevisions By\nDa...,game night mark perez revision dana fox kather...
96,6.6,398458,2737304,Bird Box,,51,"Horror, Sci-Fi",2018,BIRD BOX\n\nScreenplay by\n\nEric Heisserer\n\...,bird box screenplay eric heisserer based novel...
97,6.8,371674,2798920,Annihilation,,79,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr...",2018,ANNIHILATION\n\nAlex Garland V.10\nOPEN ON -\...,annihilation alex garland v open ext outer spa...
98,6.7,50384,2837574,The Old Man & the Gun,,80,"Biography, Comedy, Crime, Drama, Romance",2018,FOR YOUR CONSIDERATION\n\nOb En\nra wt GUN\n\n...,consideration ob en ra wt gun written david lo...


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Generate Bag of Words representation
bow_matrix = vectorizer.fit_transform(data['cleaned_script'])

# Convert sparse matrix to dense format (optional, but useful for storage or visualization)
bow_dense = bow_matrix.toarray()

# Get feature names (words in the vocabulary)
feature_names = vectorizer.get_feature_names_out()

# Add BoW embeddings to the DataFrame (optional)
bow_df = pd.DataFrame(bow_dense, columns=feature_names)
data = pd.concat([data.reset_index(drop=True), bow_df.reset_index(drop=True)], axis=1)

# View BoW embeddings
#print("Bag of Words Matrix:\n", bow_dense)
for word in feature_names:
    print(word)


aa
aac
aace
aafter
aah
aahh
aails
aakonian
aamo
aaooww
aap
aaq
aaron
aaronpoboy
aarp
aarth
aaskvarian
aatually
ab
aback
abajo
abandon
abandoned
abandoning
abandonment
abap
abate
abating
abba
abbadoo
abbas
abbey
abbia
abbiamo
abbie
abc
abdicate
abdicated
abdomen
abdominal
abduct
abducted
abducting
abduction
abductor
abe
abead
abefore
abeggin
abend
abercrombie
aberdeen
aberration
abes
abetting
abide
abies
abigail
abilene
ability
abin
abit
abject
abjoct
abk
abla
ablaze
able
ablebodied
ablowin
abm
abney
abnormal
abnormality
abnormally
aboard
abode
abolitionist
abominable
abomination
aboriginal
abort
aborted
aborting
abortion
abou
abound
aboutface
abouts
abovebehind
aboveboard
abovehe
abovethe
abr
abra
abracadabra
abraham
abrams
abrar
abras
abrasive
abrazo
abreast
abreastof
abroad
abroadsword
abrupt
abruptly
abruptness
abrutaland
absconding
absence
absent
absentee
absentia
absently
absentminded
absentmindedly
abshire
absinthe
absolutamente
absolute
absolutechaos
absolutely
absolution
absolv

In [23]:
import gensim
from gensim.models import Word2Vec

# Tokenize the scripts for Word2Vec
tokenized_scripts = [script.split() for script in data['cleaned_script']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_scripts, vector_size=100, window=5, min_count=1, workers=4)
#word2vec_model.save("word2vec.model")

# Generate embeddings for each script
def get_word2vec_embedding(script):
    words = script.split()
    embeddings = np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv], axis=0)
    return embeddings

data['word2vec_embeddings'] = data['cleaned_script'].apply(get_word2vec_embedding)

In [24]:
data

Unnamed: 0,averageRating,numVotes,imdbid,title,budget,metascore,genres,year,script,cleaned_script,...,zurich,zuriick,zwigott,zwolferkoph,zxcvbn,zychlin,zygmunt,zygmunts,zz,word2vec_embeddings
0,5.7,57993,2076822,Chaos Walking,,-1,"Adventure, Sci-Fi",2021,S.w.a.t.\n\nby\nGeorge Huang\n\nBased on th: T...,swat george huang based th television series n...,...,0,0,0,0,0,0,0,0,0,"[0.059103835, 0.453089, 0.26529384, -0.4176544..."
1,7.3,99500,2934286,Halo,,-1,"Action, Sci-Fi",2021,\n\n\n \n\nPage not found | Bobcat Press\n\n\...,page found bobcat press windowwpemojisettings ...,...,0,0,0,0,0,0,0,0,0,"[-0.41574952, 0.42408133, -0.023088407, -0.013..."
2,7.1,40996,10095582,Macbeth,,-1,"Drama, History, War",2021,FOR YOUR CONSIDERATION\n\nMACBETH\n\nBEST ADAP...,consideration macbeth best adapted screenplay ...,...,0,0,0,0,0,0,0,0,0,"[0.006053833, 0.4267924, 0.5569145, -0.155583,..."
3,6.6,38757,9130508,Cherry,,-1,Drama,2021,\n\n\n\nFADE IN:\n\n\nFADE IN:\nBLACK AND WHIT...,fade fade black whitedreamy cu serene face bea...,...,0,0,0,0,0,0,0,0,0,"[0.017710354, 0.34117308, 0.32809997, -0.21164..."
4,7.3,613111,6723592,Tenet,"$205,000,000 (estimated)",69,"Action, Sci-Fi",2020,TENET\n\nWritten by\n\nChristopher Nolan\nORC...,tenet written christopher nolan orchestra tuni...,...,0,0,0,0,0,0,0,0,0,"[-0.037483852, 0.35277635, 0.3833807, -0.36528..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6.9,261165,2704998,Game Night,"$37,000,000 (estimated)",66,"Action, Comedy, Crime, Thriller",2018,GAME NIGHT\nBy\nMark Perez\n\nRevisions By\nDa...,game night mark perez revision dana fox kather...,...,0,0,0,0,0,0,0,0,0,"[0.06828939, 0.36810827, 0.29484424, -0.266694..."
96,6.6,398458,2737304,Bird Box,,51,"Horror, Sci-Fi",2018,BIRD BOX\n\nScreenplay by\n\nEric Heisserer\n\...,bird box screenplay eric heisserer based novel...,...,0,0,0,0,0,0,0,0,0,"[0.025673937, 0.33018163, 0.49858218, -0.28732..."
97,6.8,371674,2798920,Annihilation,,79,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr...",2018,ANNIHILATION\n\nAlex Garland V.10\nOPEN ON -\...,annihilation alex garland v open ext outer spa...,...,0,0,0,0,0,0,0,0,0,"[-0.0214191, 0.38259202, 0.43802023, -0.179747..."
98,6.7,50384,2837574,The Old Man & the Gun,,80,"Biography, Comedy, Crime, Drama, Romance",2018,FOR YOUR CONSIDERATION\n\nOb En\nra wt GUN\n\n...,consideration ob en ra wt gun written david lo...,...,0,0,0,0,0,0,0,0,0,"[-0.0116701955, 0.37383407, 0.27485764, -0.395..."


In [19]:
# Load pre-trained GloVe embeddings (download from https://nlp.stanford.edu/projects/glove/)
glove_model = {}
with open('glove.6B/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_model[word] = vector

def get_glove_embedding(script):
    words = script.split()
    embeddings = np.mean([glove_model[word] for word in words if word in glove_model], axis=0)
    return embeddings

data['glove_embeddings'] = data['cleaned_script'].apply(get_glove_embedding)


In [143]:
data

Unnamed: 0,averageRating,numVotes,imdbid,title,budget,metascore,genres,year,plot,cleaned_script,word2vec_embeddings,glove_embeddings
0,7.1,44,5353,Four Feathers,,-1,Drama,1915,The story opens at General Feversham's residen...,story open general fevershams residence annual...,"[-0.10840075, 0.343514, 0.32318875, -0.0208891...","[-0.06624054, 0.045264885, 0.19010359, -0.1934..."
1,6.6,34,8041,The Hand That Rocks the Cradle,,-1,Drama,1917,A doctor's wife is the head of a bureau that p...,doctor wife head bureau publishes hand literat...,"[-0.07975083, 0.42583975, 0.28818783, -0.00507...","[0.12547019, -0.19894314, 0.18919533, -0.11963..."
2,6.7,901,12512,Now or Never,,-1,"Comedy, Short",1921,"A young man, unaccustomed to children, must ac...",young man unaccustomed child must accompany yo...,"[-0.10179423, 0.285363, 0.40394267, -0.0179752...","[0.13761406, 0.06498418, 0.24305935, -0.168516..."
3,7.8,109429,13442,Nosferatu,,-1,"Fantasy, Horror",1922,Vampire Count Orlok expresses interest in a ne...,vampire count orlok express interest new resid...,"[-0.09712987, 0.36584568, 0.2831459, -0.031721...","[-0.06036063, 0.006631947, 0.25249788, -0.0272..."
4,7.9,62497,15648,Battleship Potemkin,,97,"Drama, History, Thriller, War",1925,In the midst of the Russian Revolution of 1905...,midst russian revolution crew battleship potem...,"[-0.094044834, 0.3660296, 0.25973913, -0.01659...","[-0.04825757, 0.1778998, 0.19484283, -0.142914..."
...,...,...,...,...,...,...,...,...,...,...,...,...
2616,7.4,192412,9484998,Palm Springs,,83,"Comedy, Fantasy, Mystery, Romance",2020,When carefree Nyles and reluctant maid of hono...,carefree nyles reluctant maid honor sarah chan...,"[-0.088264726, 0.28092593, 0.24260172, -0.0152...","[-0.08915424, 0.07818702, 0.10188786, -0.15144..."
2617,7.1,15,9489732,Deep Sky Eye,,-1,Documentary,2018,"Meet Tim Doucette, a blind astronomer who buil...",meet tim doucette blind astronomer built deep ...,"[-0.05606736, 0.26449114, 0.29533, 0.007104733...","[-0.16272986, 0.0587912, 0.24682364, -0.146621..."
2618,8.7,101762,9544034,The Family Man,,-1,"Action, Comedy, Drama",2019,A working man from the National Investigation ...,working man national investigation agency try ...,"[-0.09479523, 0.4233595, 0.37675542, 0.0443046...","[-0.019234894, 0.04327571, 0.2373301, -0.21052..."
2619,5.1,380,9595506,So Pretty,,-1,Drama,2019,Four queer and trans youth in New York City st...,four queer trans youth new york city struggle ...,"[-0.12584709, 0.6850838, 0.21466684, -0.246995...","[0.26229557, 0.23553818, 0.16866894, -0.017909..."


In [25]:
# Load pre-trained FastText model
import fasttext
fasttext_model = fasttext.load_model('cc.en.300.bin/cc.en.300.bin')

def get_fasttext_embedding(script):
    words = script.split()
    embeddings = np.mean([fasttext_model.get_word_vector(word) for word in words], axis=0)
    return embeddings

data['fasttext_embeddings'] = data['cleaned_script'].apply(get_fasttext_embedding)



In [20]:
# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(script):
    inputs = tokenizer(script, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings.flatten()

data['bert_embeddings'] = data['cleaned_script'].apply(get_bert_embedding)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [23]:
data

Unnamed: 0,averageRating,numVotes,imdbid,title,budget,metascore,genres,year,script,cleaned_script,word2vec_embeddings,glove_embeddings,fasttext_embeddings
0,5.7,57993,2076822,Chaos Walking,,-1,"Adventure, Sci-Fi",2021,S.w.a.t.\n\nby\nGeorge Huang\n\nBased on th: T...,swat george huang based th television series n...,"[-0.046969388, 0.46589696, 0.07740624, -0.1887...","[-0.039170023, 0.038690332, -0.0673106, -0.058...","[0.0021861363, -0.0123300925, -0.0025244486, 0..."
1,7.3,99500,2934286,Halo,,-1,"Action, Sci-Fi",2021,\n\n\n \n\nPage not found | Bobcat Press\n\n\...,page found bobcat press windowwpemojisettings ...,"[-0.24507631, 0.7540548, -0.46853718, -0.45219...","[-0.040147495, -0.013314822, -0.0018307532, -0...","[0.047132026, -0.07452622, -0.060340513, 0.037..."
2,7.1,40996,10095582,Macbeth,,-1,"Drama, History, War",2021,FOR YOUR CONSIDERATION\n\nMACBETH\n\nBEST ADAP...,consideration macbeth best adapted screenplay ...,"[-0.097308226, 0.4741566, 0.30222332, 0.046546...","[-0.077748224, -0.0041216365, -0.020653853, -0...","[-0.0044389307, -0.0051751067, 0.0098700635, 0..."
3,6.6,38757,9130508,Cherry,,-1,Drama,2021,\n\n\n\nFADE IN:\n\n\nFADE IN:\nBLACK AND WHIT...,fade fade black whitedreamy cu serene face bea...,"[-0.10989917, 0.362576, 0.14586078, 0.02407363...","[-0.08178206, -0.018020254, -0.0644763, -0.018...","[0.01082601, -0.007596205, -0.005855361, 0.077..."
4,7.3,613111,6723592,Tenet,"$205,000,000 (estimated)",69,"Action, Sci-Fi",2020,TENET\n\nWritten by\n\nChristopher Nolan\nORC...,tenet written christopher nolan orchestra tuni...,"[-0.15020654, 0.34708077, 0.21682572, -0.13185...","[-0.058570758, 0.00037278762, -0.018962432, -0...","[-0.006108128, -0.0022590724, 0.008155542, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6.9,261165,2704998,Game Night,"$37,000,000 (estimated)",66,"Action, Comedy, Crime, Thriller",2018,GAME NIGHT\nBy\nMark Perez\n\nRevisions By\nDa...,game night mark perez revision dana fox kather...,"[-0.08510605, 0.38281652, 0.10046724, 0.004455...","[-0.044001076, 0.045774885, -0.0729081, -0.036...","[0.0074814074, -0.010142077, 0.004715462, 0.08..."
96,6.6,398458,2737304,Bird Box,,51,"Horror, Sci-Fi",2018,BIRD BOX\n\nScreenplay by\n\nEric Heisserer\n\...,bird box screenplay eric heisserer based novel...,"[-0.11732296, 0.34727016, 0.2725017, 0.0027993...","[-0.07158886, 0.0102483565, -0.08668974, -0.08...","[0.0028788687, -0.0011464348, 0.019387014, 0.0..."
97,6.8,371674,2798920,Annihilation,,79,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr...",2018,ANNIHILATION\n\nAlex Garland V.10\nOPEN ON -\...,annihilation alex garland v open ext outer spa...,"[-0.11785787, 0.41884735, 0.20533733, 0.033103...","[-0.07238136, 0.005471114, -0.060680687, -0.07...","[0.0031235514, -0.0097697545, 0.0044839787, 0...."
98,6.7,50384,2837574,The Old Man & the Gun,,80,"Biography, Comedy, Crime, Drama, Romance",2018,FOR YOUR CONSIDERATION\n\nOb En\nra wt GUN\n\n...,consideration ob en ra wt gun written david lo...,"[-0.119000636, 0.38204187, 0.09997495, -0.1180...","[-0.09440231, 0.018000484, -0.068600126, -0.03...","[-0.0015843244, -0.015328459, 0.0005569546, 0...."


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Features (TF-IDF) and target
X_text = data['cleaned_script']
y = data['averageRating']

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Unigrams and bigrams, max 5000 features
X = tfidf.fit_transform(X_text).toarray()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X = np.array([embedding for embedding in data['word2vec_embeddings']])  # You can choose other embeddings like glove or fasttext
y = data['averageRating'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
X = np.array([embedding for embedding in data['fasttext_embeddings']])  # You can choose other embeddings like glove or fasttext
y = data['averageRating'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [147]:
X = np.array([embedding for embedding in data['glove_embeddings']])  # You can choose other embeddings like glove or fasttext
y = data['averageRating'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
X = np.array([embedding for embedding in data['bert_embeddings']])  # You can choose other embeddings like glove or fasttext
y = data['averageRating'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("Random Forest MSE:", mean_squared_error(y_test, y_pred_rf))

Random Forest MSE: 0.9631591999999989


TFIDF - 0.79
Glove 100 - 0.855
Glove 300 - 0.83

In [47]:
rf.score(X_test,y_test)

-0.0199986232824112

In [48]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

# Evaluate the model
print("SVR MSE:", mean_squared_error(y_test, y_pred_svr))

SVR MSE: 1.064699064528309


TFIDF - 0.79

In [49]:
# Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluate the model
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred_lr))


Linear Regression MSE: 1.0408672940998343


In [50]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"{model.__class__.__name__}: MSE = {mse:.4f}, MAE = {mae:.4f}")

In [51]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np

In [52]:
Linear

NameError: name 'Linear' is not defined

In [53]:
evaluate_model(LinearRegression(), X_train, X_test, y_train, y_test)

LinearRegression: MSE = 1.0409, MAE = 0.7744


TFIDF - 1.3

In [54]:
evaluate_model(Ridge(alpha=1.0), X_train, X_test, y_train, y_test)

Ridge: MSE = 1.0200, MAE = 0.7567


TFIDF - 0.79

In [55]:
evaluate_model(Lasso(alpha=0.1), X_train, X_test, y_train, y_test)

Lasso: MSE = 1.0112, MAE = 0.7597


TFIDF - 0.9

In [56]:
evaluate_model(ElasticNet(alpha=0.1, l1_ratio=0.5), X_train, X_test, y_train, y_test)

ElasticNet: MSE = 1.0112, MAE = 0.7597


In [57]:
evaluate_model(BayesianRidge(), X_train, X_test, y_train, y_test)

BayesianRidge: MSE = 1.0410, MAE = 0.7745


TFIDF - 0.78

Decision Tree

In [58]:
evaluate_model(DecisionTreeRegressor(random_state=42), X_train, X_test, y_train, y_test)


DecisionTreeRegressor: MSE = 1.5780, MAE = 1.0100


TFIDF - 1.4

In [59]:
evaluate_model(RandomForestRegressor(n_estimators=100, random_state=42), X_train, X_test, y_train, y_test)


RandomForestRegressor: MSE = 0.9632, MAE = 0.7339


TFIDF - 0.79

In [129]:
evaluate_model(GradientBoostingRegressor(random_state=42), X_train, X_test, y_train, y_test)


GradientBoostingRegressor: MSE = 0.8161, MAE = 0.6680


TFIDF - 0.81

In [130]:
evaluate_model(AdaBoostRegressor(random_state=42), X_train, X_test, y_train, y_test)


AdaBoostRegressor: MSE = 1.0446, MAE = 0.8160


TFIDF - 1.04

In [131]:
evaluate_model(AdaBoostRegressor(random_state=42), X_train, X_test, y_train, y_test)

AdaBoostRegressor: MSE = 1.0446, MAE = 0.8160


TFIDF - 1.04

In [132]:
evaluate_model(ExtraTreesRegressor(random_state=42), X_train, X_test, y_train, y_test)

ExtraTreesRegressor: MSE = 0.7991, MAE = 0.6662


TFIDF - 0.79

Support Vector Models

In [133]:
evaluate_model(SVR(kernel='rbf', C=1.0), X_train, X_test, y_train, y_test)


SVR: MSE = 0.7998, MAE = 0.6721


TFIDF - 0.79

KNN

In [134]:
evaluate_model(KNeighborsRegressor(n_neighbors=5), X_train, X_test, y_train, y_test)


KNeighborsRegressor: MSE = 1.0116, MAE = 0.7634


TFIDF - 1.0

NN

In [43]:
evaluate_model(MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42), X_train, X_test, y_train, y_test)


MLPRegressor: MSE = 1.1487, MAE = 0.8139




TFIDF - 2.0

In [42]:
evaluate_model(GaussianProcessRegressor(random_state=42), X_train, X_test, y_train, y_test)


GaussianProcessRegressor: MSE = 1.6562, MAE = 1.0183


TFIDF - 0.85

In [41]:
evaluate_model(BaggingRegressor( random_state=42), X_train, X_test, y_train, y_test)


BaggingRegressor: MSE = 1.0722, MAE = 0.8105


TFIDF - 0.84

In [138]:
estimators = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42))
]
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=GradientBoostingRegressor())
evaluate_model(stacking_regressor, X_train, X_test, y_train, y_test)


StackingRegressor: MSE = 0.7994, MAE = 0.6811


TFIDF - 0.79

In [139]:
voting_regressor = VotingRegressor([
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
    ('svr', SVR(kernel='rbf'))
])
evaluate_model(voting_regressor, X_train, X_test, y_train, y_test)


VotingRegressor: MSE = 0.7954, MAE = 0.6755


In [140]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
evaluate_model(poly_model, X_train, X_test, y_train, y_test)

MemoryError: Unable to allocate 195. GiB for an array with shape (2096, 12507501) and data type float64

In [None]:
from sklearn.linear_model import TheilSenRegressor
evaluate_model(TheilSenRegressor(random_state=42), X_train, X_test, y_train, y_test)