# 2. Transformación

## 2.1 Importando paquetes y datasets

In [1]:
import pandas as pd
import nltk
import math
import io
import re
import ast
import warnings
warnings.filterwarnings('ignore')

#Imports de vectorizacion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
# with open("res/preprocesado_training.csv", "r", encoding='utf-8') as f:
#     df_traing = pd.read_csv(f, encoding="utf-8")
#     print("Dataset loaded")

In [3]:
# with open("res/preprocesado_testing.csv", "r", encoding='utf-8') as f:
#     df_test = pd.read_csv(f, encoding="utf-8")
#     print("Dataset loaded")

In [4]:
with open("movies_dataset/preprocessed_training.csv", "r", encoding='utf-8') as f:
    df_train = pd.read_csv(f, encoding="utf-8")
    print("Training dataset loaded")

Training dataset loaded


In [5]:
with open("movies_dataset/preprocessed_testing.csv", "r", encoding='utf-8') as f:
    df_test = pd.read_csv(f, encoding="utf-8")
    print("Testing dataset loaded")

Testing dataset loaded


## 2.1 Transformación de lenguaje natural

### 2.1.1 Transformación del training

Una vez realizado el tokenizado de las columnas compuestas por texto plano, la siguiente labor será la de vectorizar estas columnas, de forma que un algoritmo pueda procesar estos datos. Para ello se cogerán todas aquellas columnas de lenguaje natural y se tomarán todas las palabras aparecidas en ellas.

In [6]:
def to_plain(row):
    text = ''
    items = ast.literal_eval(row)
    items = set(items)
    for item in items:
        text += f' {item}'
    return text

In [7]:
title = df_train['title'].apply(to_plain)
title.head()

0                story toy
1                  jumanji
2           waiting exhale
3     ii part father bride
4                     heat
Name: title, dtype: object

In [8]:
tagline = df_train['tagline'].apply(to_plain)
tagline.head()

0                                       
1           excitement dice roll unleash
2         never friend people forget let
3     life back world surprise he normal
4                 los angeles crime saga
Name: tagline, dtype: object

In [9]:
overview = df_train['overview'].apply(to_plain)
overview.head()

0     separate brings duo andys learns owner birthd...
1     prof terrifying open invite running enchanted...
2     breath cheated good vannah robin better strin...
3     wife plan news bank change grandchild arrival...
4     catandmouse crew pursues end thief throughout...
Name: overview, dtype: object

Una vez pasado a texto plano todas las columnas, se vectorizarán cada una de estas características de los datos para training.

In [10]:
def best_words(raw):
    vec = CountVectorizer().fit(raw)
    words = vec.transform(raw)
    sum_words = words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    n = int(words_freq[0][1] * 0.3)
    best_words = []
    for word in words_freq:
        if word[1] >= n:
            best_words.append(word[0])
        else:
            break
    return best_words

In [11]:
overview_words = best_words(overview)
title_words= best_words(title)
tagline_words = best_words(tagline)

In [12]:
def treat_text(df, best_words, column):
    for word in best_words:
        df[f'{column}_{word}'] = 0
        
    for i, row in df.iterrows():
        try:
            items = ast.literal_eval(row[column])
            for word in items:
                if word in best_words:
                    df.iloc[i, df.columns.get_loc(f'{column}_{word}')] = 1
        except:
            continue

Las palabras con más ocurrencias de cada características serán añadidas como columnas.

In [13]:
treat_text(df_train, overview_words, 'overview')

In [14]:
treat_text(df_train, title_words, 'title')

In [15]:
treat_text(df_train, tagline_words, 'tagline')

In [16]:
df_train.head()

Unnamed: 0,belongs_to_collection,budget,id,original_language,overview,production_companies,revenue,runtime,tagline,title,...,tagline_life,tagline_story,tagline_world,tagline_man,tagline_never,tagline_time,tagline_get,tagline_back,tagline_come,tagline_he
0,10194,30000000,862,en,"['led', 'woody', 'andys', 'toy', 'live', 'happ...",52,373554033.0,81.0,[],"['toy', 'story']",...,0,0,0,0,0,0,0,0,0,0
1,-1,65000000,8844,en,"['sibling', 'judy', 'peter', 'discover', 'ench...",196,262797249.0,104.0,"['roll', 'dice', 'unleash', 'excitement']",['jumanji'],...,0,0,0,0,0,0,0,0,0,0
2,-1,16000000,31357,en,"['cheated', 'mistreated', 'stepped', 'woman', ...",812,81452156.0,127.0,"['friend', 'people', 'let', 'never', 'let', 'f...","['waiting', 'exhale']",...,0,0,0,0,1,0,0,0,0,0
3,96871,0,11862,en,"['george', 'bank', 'recovered', 'daughter', 'w...",224,76578911.0,106.0,"['world', 'back', 'normal', 'he', 'surprise', ...","['father', 'bride', 'part', 'ii']",...,1,0,1,0,0,0,0,1,0,1
4,-1,60000000,949,en,"['obsessive', 'master', 'thief', 'neil', 'mcca...",1184,187436818.0,170.0,"['los', 'angeles', 'crime', 'saga']",['heat'],...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_train.drop(['title', 'tagline', 'overview'], axis=1, inplace=True)

### 2.1.2 Transformación del testing

Para el dataset para testing, también se añadirán estas mismas columnas con One Hot Encoding, si aparacen en cada una de las columnas ya mencionadas anteriormente.

In [18]:
treat_text(df_test, overview_words, 'overview')

In [19]:
treat_text(df_test, title_words, 'title')

In [20]:
treat_text(df_test, tagline_words, 'tagline')

In [21]:
df_test.drop(['title', 'tagline', 'overview'], axis=1, inplace=True)

## 2.2 Transformación de columnas categóricas

### 2.2.1 Release day of the week

Esta columna que indica el día de la semana, será procesada mediante Label Encoding, tanto del training y del testing.

In [22]:
lb_enc = LabelEncoder()

df_train['release_day'] = lb_enc.fit_transform(df_train['release_day'])
df_test['release_day'] = lb_enc.fit_transform(df_test['release_day'])

### 2.2.2 Original language

Esta columna también será procesada mediante Label Encoding.

In [23]:
df_train['original_language'] = lb_enc.fit_transform(df_train['original_language'])
df_test['original_language'] = lb_enc.fit_transform(df_test['original_language'])

## 2.3 Actualización de los datasets

In [24]:
df_train.to_csv("movies_dataset/transformed_training.csv", index = False)

In [25]:
df_test.to_csv("movies_dataset/transformed_testing.csv", index = False)