## Preparing the Dataset

We add emotion-analysis scores for each movie description to the dataset and create rating tiers by rounding each movie's rating up to the next whole number.


In [None]:
import os
import pandas as pd
import numpy as np 
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer
# Folder that holds n_movies.csv. Defaults to the original author's local path;
# set the NETFLIX_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    "NETFLIX_DATA_DIR",
    "C:\\Users\\Hithesh\\Desktop\\Datawise\\ML\\netflix",
)
# The working directory is switched (rather than building full paths) so that
# bare relative file names such as "n_movies.csv" resolve inside DATA_DIR.
os.chdir(DATA_DIR)
df = pd.read_csv("n_movies.csv")
df.head()

# Quick look at the schema and the observed rating range.
df.columns
np.min(df["rating"]) ## 1.7
np.max(df["rating"]) ## 9.9

# Tier = rating rounded up to the next whole number (e.g. 1.7 -> 2.0, 9.9 -> 10.0).
df["tiers"] = np.ceil(df["rating"])

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a mapping of per-example sequences.

    `tokenized_texts` is a mapping (it must support `["input_ids"]` and
    `.items()`) whose values are sequences aligned by example position.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples is taken from the "input_ids" entry.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th element of every field into a single example dict.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example
    
# Hub id of the checkpoint used to score each description.
model_name = "j-hartmann/emotion-english-distilroberta-base"
# Load the tokenizer and the sequence-classification model that belong to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No training arguments or datasets are passed; the Trainer is only used below for `predict`.
trainer = Trainer(model=model)

# Alternative route through the high-level pipeline API, left disabled:
#classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True) ##hugging-hub

# NOTE(review): astype('str') stringifies every value, so a missing description
# would be scored as the literal text "nan" - confirm the column has no NaNs.
pred_texts = df["description"].astype('str').tolist()
# truncation=True cuts over-long inputs; padding=True pads the rest to a common length.
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
# Wrap the tokenizer output so it can be indexed one example at a time.
pred_dataset = SimpleDataset(tokenized_texts)
predictions = trainer.predict(pred_dataset)

# Raw model outputs (logits): one row per input text, one column per label id.
logits = predictions.predictions

# Predicted label id per text, and its human-readable name.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)

# scores raw
# Softmax over the label axis, computed once. Subtracting the per-row maximum
# before exponentiating leaves the result mathematically unchanged but keeps
# np.exp from overflowing on large logits.
exp_logits = np.exp(logits - logits.max(-1, keepdims=True))
temp = exp_logits / exp_logits.sum(-1, keepdims=True)

# Probability of the winning label for each text.
scores = temp.max(1)

# Add one probability column per emotion to the frame.
# Column names are read from the model's own id -> label mapping rather than
# hand-numbered indices, so a column cannot be attached to the wrong emotion,
# and each column is sliced out of `temp` directly instead of being built
# row by row in a Python loop.
emotion_scores = {
    model.config.id2label[label_id]: temp[:, label_id]
    for label_id in range(temp.shape[1])
}
df = df.assign(**emotion_scores)

# NOTE(review): to_csv also writes the row index as an unnamed first column,
# which comes back as "Unnamed: 0" on read_csv. Pass index=False here if no
# consumer of this file relies on that column.
new_file = "n_movies_emomods.csv"
df.to_csv(new_file)


## Regression
We first load our newly created dataset and use the scikit-learn library to run our regression.
We start with a straightforward linear regression to take a first look at the results and the classification.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# The file name must be a string literal; unquoted it is parsed as attribute
# access on an undefined name and raises NameError.
nmovies = pd.read_csv("n_movies_emomods.csv")

# NOTE(review): the original cell called `lr.fix()` with no data, so the
# intended features and target are not recorded anywhere. Assumed here:
# predict "rating" from the seven emotion-score columns - confirm.
EMOTION_COLUMNS = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
TARGET_COLUMN = "rating"

# LinearRegression rejects NaNs, so rows missing a feature or the target are dropped.
model_data = nmovies.dropna(subset=EMOTION_COLUMNS + [TARGET_COLUMN])
X = model_data[EMOTION_COLUMNS]
y = model_data[TARGET_COLUMN]

# NOTE(review): the emotion scores are softmax outputs and sum to 1 per row,
# so together with the intercept they are perfectly collinear. The fit still
# succeeds, but individual coefficients are not uniquely determined; drop one
# emotion column as a reference category before interpreting them.
lr = LinearRegression()
lr.fit(X, y)