# 2. Transformación

## 2.1 Importando paquetes y datasets

In [38]:
import pandas as pd
import nltk
import math
import io
import re
import ast
import warnings
warnings.filterwarnings('ignore')

#Imports de vectorizacion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# with open("res/preprocesado_training.csv", "r", encoding='utf-8') as f:
#     df_traing = pd.read_csv(f, encoding="utf-8")
#     print("Dataset loaded")

In [None]:
# with open("res/preprocesado_testing.csv", "r", encoding='utf-8') as f:
#     df_test = pd.read_csv(f, encoding="utf-8")
#     print("Dataset loaded")

In [6]:
# Read the training split into a DataFrame from an explicitly UTF-8 decoded handle.
with open("res/movies_training.csv", "r", encoding='utf-8') as training_file:
    df_train = pd.read_csv(training_file, encoding="utf-8")
print("Dataset loaded")

Dataset loaded


In [7]:
# Read the testing split into a DataFrame from an explicitly UTF-8 decoded handle.
with open("res/movies_testing.csv", "r", encoding='utf-8') as testing_file:
    df_test = pd.read_csv(testing_file, encoding="utf-8")
print("Dataset loaded")

Dataset loaded


## 2.1 Transformación de lenguaje natural

### 2.1.1 Transformación del training

Una vez realizado el tokenizado de las columnas compuestas por texto plano, la siguiente labor será la de vectorizar estas columnas, de forma que un algoritmo pueda procesar estos datos. Para ello se cogerán todas aquellas columnas de lenguaje natural y se tomarán todas las palabras aparecidas en ellas.

In [78]:
def to_plain(row):
    """Turn the string form of a Python list into a space-separated text.

    ``row`` is parsed with ``ast.literal_eval`` and de-duplicated through a
    ``set``; every unique item is then emitted preceded by a single space, so
    a non-empty result starts with a space and an empty list yields ``''``.
    The order of the items follows the set's iteration order.
    """
    unique_items = set(ast.literal_eval(row))
    return ''.join(f' {item}' for item in unique_items)

In [79]:
# Flatten every stringified token list of the title column into plain text.
title = df_train['title'].map(to_plain)
title.head(5)

0                story toy
1                  jumanji
2           exhale waiting
3     bride father ii part
4                     heat
Name: title, dtype: object

In [75]:
# Flatten every stringified token list of the tagline column into plain text.
tagline = df_train['tagline'].map(to_plain)
tagline.head(5)

0                                       
1           excitement unleash dice roll
2         let friend never people forget
3     he world normal back surprise life
4                 angeles crime saga los
Name: tagline, dtype: object

In [60]:
# Flatten every stringified token list of the overview column into plain text.
overview = df_train['overview'].map(to_plain)
overview.head(5)

0     brings onto separate andys room duo live plac...
1     trapped game world board door room living fre...
2     talk breathe bernie vannah friend cheated bre...
3     wife change recovered home grandchild expecti...
4     topnotch thief game man obsessive vincent abi...
Name: overview, dtype: object

Una vez pasadas a texto plano todas las columnas, se vectorizará cada una de estas características de los datos de training.

In [88]:
def best_words(raw, ratio=0.3):
    """Select the most frequent words of a collection of texts.

    The texts are vectorised with ``CountVectorizer`` and the occurrences of
    each vocabulary word are summed over all documents. A word is kept when
    its total is at least ``int(top_total * ratio)``, where ``top_total`` is
    the total of the most frequent word.

    Parameters
    ----------
    raw : iterable of str
        Documents to analyse (anything ``CountVectorizer`` accepts).
    ratio : float, default 0.3
        Fraction of the highest total a word must reach to be selected. The
        default reproduces the previously hard-coded cutoff.

    Returns
    -------
    list of str
        Selected words, ordered from most to least frequent.

    Notes
    -----
    ``CountVectorizer`` itself raises when no vocabulary can be built from
    ``raw`` (e.g. only empty documents); that error is not intercepted here.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(raw)
    totals = counts.sum(axis=0)  # one row holding the per-word totals

    word_totals = [
        (word, totals[0, position])
        for word, position in vectorizer.vocabulary_.items()
    ]
    # Stable sort: words with equal totals keep their vocabulary order.
    word_totals.sort(key=lambda pair: pair[1], reverse=True)

    threshold = int(word_totals[0][1] * ratio)
    # The list is sorted in descending order, so filtering is equivalent to
    # stopping at the first word that falls below the threshold.
    return [word for word, total in word_totals if total >= threshold]

In [94]:
# Compute the most frequent words of each plain-text column (same order as before:
# overview, title, tagline).
overview_words, title_words, tagline_words = (
    best_words(texts) for texts in (overview, title, tagline)
)

In [91]:
def treat_text(df, best_words, column):
    """Add one 0/1 indicator column per selected word to ``df``, in place.

    For every word in ``best_words`` a column named ``f'{column}_{word}'`` is
    created (or overwritten). It holds 1 in the rows whose ``column`` cell,
    parsed with ``ast.literal_eval``, contains that word, and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    best_words : iterable
        Words to encode (presumably strings; they must be hashable).
    column : str
        Name of the column whose cells hold the string form of a token list.

    Returns
    -------
    None

    Notes
    -----
    Cells that cannot be parsed or iterated (missing values, malformed
    literals) are treated as containing no tokens, so their indicators stay 0.
    Values are assigned by row position, so the result does not depend on the
    frame's index being the default 0..n-1 range.
    """
    best_words = list(best_words)
    vocabulary = set(best_words)  # constant-time membership checks

    def _selected_tokens(cell):
        # Only parsing/iteration problems are tolerated; anything else propagates.
        try:
            return vocabulary.intersection(ast.literal_eval(cell))
        except (ValueError, SyntaxError, TypeError):
            return frozenset()

    found_per_row = [_selected_tokens(cell) for cell in df[column]]

    for word in best_words:
        # Build the Series on df.index so the assignment is purely positional.
        df[f'{column}_{word}'] = pd.Series(
            [int(word in found) for found in found_per_row],
            index=df.index,
            dtype='int64',
        )

Las palabras con más ocurrencias de cada característica serán añadidas como columnas.

In [92]:
# Add the overview indicator columns to the training frame (in place).
treat_text(df=df_train, best_words=overview_words, column='overview')

In [95]:
# Add the title indicator columns to the training frame (in place).
treat_text(df=df_train, best_words=title_words, column='title')

In [96]:
# Add the tagline indicator columns to the training frame (in place).
treat_text(df=df_train, best_words=tagline_words, column='tagline')

In [97]:
# Preview the first rows, including the newly added indicator columns.
df_train.head(n=5)

Unnamed: 0,belongs_to_collection,budget,original_language,overview,production_companies,revenue,runtime,tagline,title,vote_average,...,tagline_life,tagline_story,tagline_world,tagline_man,tagline_never,tagline_time,tagline_get,tagline_back,tagline_come,tagline_he
0,-1,30000000,en,"['led', 'woody', 'andys', 'toy', 'live', 'happ...",52,373554033.0,81.0,[],"['toy', 'story']",7.7,...,0,0,0,0,0,0,0,0,0,0
1,-1,65000000,en,"['sibling', 'judy', 'peter', 'discover', 'ench...",196,262797249.0,104.0,"['roll', 'dice', 'unleash', 'excitement']",['jumanji'],6.9,...,0,0,0,0,0,0,0,0,0,0
2,-1,16000000,en,"['cheated', 'mistreated', 'stepped', 'woman', ...",812,81452156.0,127.0,"['friend', 'people', 'let', 'never', 'let', 'f...","['waiting', 'exhale']",6.1,...,0,0,0,0,1,0,0,0,0,0
3,-1,0,en,"['george', 'bank', 'recovered', 'daughter', 'w...",224,76578911.0,106.0,"['world', 'back', 'normal', 'he', 'surprise', ...","['father', 'bride', 'part', 'ii']",5.7,...,1,0,1,0,0,0,0,1,0,1
4,-1,60000000,en,"['obsessive', 'master', 'thief', 'neil', 'mcca...",1184,187436818.0,170.0,"['los', 'angeles', 'crime', 'saga']",['heat'],7.7,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# The raw text columns are no longer needed once their indicator columns exist.
text_columns = ['title', 'tagline', 'overview']
df_train.drop(columns=text_columns, inplace=True)

### 2.1.2 Transformación del testing

Para el dataset de testing, también se añadirán estas mismas columnas con One Hot Encoding, si aparecen en cada una de las columnas ya mencionadas anteriormente.

In [None]:
# Add the overview indicator columns (words chosen on training) to the testing frame.
treat_text(df=df_test, best_words=overview_words, column='overview')

In [None]:
# Add the title indicator columns (words chosen on training) to the testing frame.
treat_text(df=df_test, best_words=title_words, column='title')

In [None]:
# Add the tagline indicator columns (words chosen on training) to the testing frame.
treat_text(df=df_test, best_words=tagline_words, column='tagline')

In [101]:
# The raw text columns are no longer needed once their indicator columns exist.
text_columns = ['title', 'tagline', 'overview']
df_test.drop(columns=text_columns, inplace=True)

## 2.2 Transformación de columnas categóricas

### 2.2.1 Release day of the week

Esta columna, que indica el día de la semana, será procesada mediante Label Encoding, tanto en el training como en el testing.

In [None]:
lb_enc = LabelEncoder()

# Fit the encoder ONCE on the labels of both splits so that a given day gets the
# same integer code in training and testing. Calling fit_transform on each split
# separately builds two independent mappings, which can assign different codes to
# the same label whenever the splits do not contain exactly the same set of values.
lb_enc.fit(pd.concat([df_train['release_day'], df_test['release_day']], ignore_index=True))

df_train['release_day'] = lb_enc.transform(df_train['release_day'])
df_test['release_day'] = lb_enc.transform(df_test['release_day'])

### 2.2.2 Original language

Esta columna también será procesada mediante Label Encoding.

In [None]:
# Fit the encoder ONCE on the languages of both splits so that a given language gets
# the same integer code in training and testing. Separate fit_transform calls build
# two independent mappings, so codes would not be comparable between the splits.
lb_enc.fit(pd.concat([df_train['original_language'], df_test['original_language']], ignore_index=True))

df_train['original_language'] = lb_enc.transform(df_train['original_language'])
df_test['original_language'] = lb_enc.transform(df_test['original_language'])

## 2.3 Actualización de los datasets

In [None]:
# Persist the transformed training split without writing the index as a column.
df_train.to_csv(path_or_buf="res/transformed_training.csv", index=False)

In [None]:
# Persist the transformed testing split without writing the index as a column.
df_test.to_csv(path_or_buf="res/transformed_testing.csv", index=False)