In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('all_streaming.csv')

In [3]:
df.head(5)

Unnamed: 0,movie_or_serie,title,director,cast,country,date_added_platform,release_year,duration_seconds,gender_type,description,channel_streaming
0,Movie,ricky velez: here's everything,uninformed director,uninformed cast,uninformed country,"October 24, 2021",2021,,"comedy, stand up",​Comedian Ricky Velez bares it all with his ho...,hulu-movies-and-tv-shows
1,Movie,silent night,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2020,94 min,"crime, drama, thriller","Mark, a low end South London hitman recently r...",hulu-movies-and-tv-shows
2,Movie,the marksman,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2021,108 min,"action, thriller",A hardened Arizona rancher tries to protect an...,hulu-movies-and-tv-shows
3,Movie,gaia,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,97 min,horror,A forest ranger and two survivalists with a cu...,hulu-movies-and-tv-shows
4,Movie,settlers,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,104 min,"science fiction, thriller",Mankind's earliest settlers on the Martian fro...,hulu-movies-and-tv-shows


Start with a preliminary analysis

In [4]:
#Visualize the shape of the data set
df.shape

(22998, 11)

In [5]:
#General information on the data set
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22998 entries, 0 to 22997
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   movie_or_serie       22998 non-null  object
 1   title                22998 non-null  object
 2   director             22998 non-null  object
 3   cast                 22998 non-null  object
 4   country              22998 non-null  object
 5   date_added_platform  13444 non-null  object
 6   release_year         22998 non-null  int64 
 7   duration_seconds     22516 non-null  object
 8   gender_type          22998 non-null  object
 9   description          22994 non-null  object
 10  channel_streaming    22998 non-null  object
dtypes: int64(1), object(10)
memory usage: 1.9+ MB


In [6]:
# Check for missing values
print(df.isna().sum())

movie_or_serie            0
title                     0
director                  0
cast                      0
country                   0
date_added_platform    9554
release_year              0
duration_seconds        482
gender_type               0
description               4
channel_streaming         0
dtype: int64


In [7]:
#There are some columns with missing data, we can create another dataframe without the rows with Nans
df_non_null = df.dropna().reset_index()
df_non_null.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12961 entries, 0 to 12960
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   index                12961 non-null  int64 
 1   movie_or_serie       12961 non-null  object
 2   title                12961 non-null  object
 3   director             12961 non-null  object
 4   cast                 12961 non-null  object
 5   country              12961 non-null  object
 6   date_added_platform  12961 non-null  object
 7   release_year         12961 non-null  int64 
 8   duration_seconds     12961 non-null  object
 9   gender_type          12961 non-null  object
 10  description          12961 non-null  object
 11  channel_streaming    12961 non-null  object
dtypes: int64(2), object(10)
memory usage: 1.2+ MB


In [8]:
# Check for the gender of the movies

print(df_non_null['gender_type'].value_counts())

gender_type
documentaries                             509
dramas, international ,                   362
stand,up comedy                           334
comedies, dramas, international ,         274
dramas, independent ,, international ,    252
                                         ... 
anthology, science fiction                  1
musical, romance, western                   1
animals , nature, anthology                 1
biographical, coming of age, drama          1
cult ,, dramas, thrillers                   1
Name: count, Length: 1265, dtype: int64


In [9]:
#Better way to visualize the classification is through the creation of a new matrix
#Import one hot encoder

from sklearn.preprocessing import OneHotEncoder

In [10]:
df_non_null['gender_type'][0]

'crime, drama, thriller'

In [11]:
#With split method we can extract the single strings and put them into a list

print(df_non_null['gender_type'][1].split(","))

['action', ' thriller']


In [12]:
df_non_null['gender_type1'] = df_non_null['gender_type'].str.split(', ')
df_non_null.head()

Unnamed: 0,index,movie_or_serie,title,director,cast,country,date_added_platform,release_year,duration_seconds,gender_type,description,channel_streaming,gender_type1
0,1,Movie,silent night,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2020,94 min,"crime, drama, thriller","Mark, a low end South London hitman recently r...",hulu-movies-and-tv-shows,"[crime, drama, thriller]"
1,2,Movie,the marksman,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2021,108 min,"action, thriller",A hardened Arizona rancher tries to protect an...,hulu-movies-and-tv-shows,"[action, thriller]"
2,3,Movie,gaia,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,97 min,horror,A forest ranger and two survivalists with a cu...,hulu-movies-and-tv-shows,[horror]
3,4,Movie,settlers,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,104 min,"science fiction, thriller",Mankind's earliest settlers on the Martian fro...,hulu-movies-and-tv-shows,"[science fiction, thriller]"
4,5,TV Show,the halloween candy magic pet,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,1 Season,"family, kids",Join Mila and Morphle on a mystery-filled Hall...,hulu-movies-and-tv-shows,"[family, kids]"


In [26]:
df_non_null['gender_type1'][0]

['crime', 'drama', 'thriller']

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer 

In [15]:
mlb = MultiLabelBinarizer()

In [16]:
mlb.fit_transform(df_non_null["gender_type1"])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
new_columns = [ x.strip() for x in mlb.classes_]
new_columns

['',
 ',',
 'action',
 'action',
 'action,adventure',
 'adult animation',
 'adventure',
 'and culture',
 'animals',
 'animation',
 'anime',
 'anime features',
 'anime series',
 'anthology',
 'arts',
 'biographical',
 'black stories',
 'british ,',
 'buddy',
 'cartoons',
 'children',
 'classic',
 'classic ,',
 'classics',
 'comedies',
 'comedy',
 'coming of age',
 'competition',
 'concert film',
 'cooking',
 'crime',
 'crime ,',
 'cult ,',
 'culture',
 'dance',
 'disaster',
 'documentaries',
 'documentary',
 'docuseries',
 'drama',
 'dramas',
 'entertainment',
 'faith',
 'faith and spirituality',
 'family',
 'family ,',
 'fantasy',
 'food',
 'game show',
 'game shows',
 'health',
 'historical',
 'history',
 'horror',
 'horror ,',
 'independent ,',
 'international',
 'international ,',
 'kids',
 "kids' ,",
 'korean ,',
 'late night',
 'latino',
 'lgbtq',
 'lgbtq ,',
 'lgbtq+',
 'lifestyle',
 'lifestyle',
 'medical',
 'melodrama',
 'music',
 'music',
 'musical',
 'musicals',
 'mysteries',

In [18]:
new_columns.remove('')
new_columns.remove(',')
new_columns

['action',
 'action',
 'action,adventure',
 'adult animation',
 'adventure',
 'and culture',
 'animals',
 'animation',
 'anime',
 'anime features',
 'anime series',
 'anthology',
 'arts',
 'biographical',
 'black stories',
 'british ,',
 'buddy',
 'cartoons',
 'children',
 'classic',
 'classic ,',
 'classics',
 'comedies',
 'comedy',
 'coming of age',
 'competition',
 'concert film',
 'cooking',
 'crime',
 'crime ,',
 'cult ,',
 'culture',
 'dance',
 'disaster',
 'documentaries',
 'documentary',
 'docuseries',
 'drama',
 'dramas',
 'entertainment',
 'faith',
 'faith and spirituality',
 'family',
 'family ,',
 'fantasy',
 'food',
 'game show',
 'game shows',
 'health',
 'historical',
 'history',
 'horror',
 'horror ,',
 'independent ,',
 'international',
 'international ,',
 'kids',
 "kids' ,",
 'korean ,',
 'late night',
 'latino',
 'lgbtq',
 'lgbtq ,',
 'lgbtq+',
 'lifestyle',
 'lifestyle',
 'medical',
 'melodrama',
 'music',
 'music',
 'musical',
 'musicals',
 'mysteries',
 'mystery'

In [216]:
import spacy

# Cargar el modelo de lenguaje de spaCy
nlp = spacy.load("en_core_web_sm")

grupos={}
grupos['caracteres']=new_columns

# Agrupar las palabras basadas en la similitud semántica
grupos_simplificados = {}

for num_grupo, lista_palabras in grupos.items():
    grupo_simplificado = []
    for palabra in lista_palabras:
        for _, lista_simplificada in grupos_simplificados.items():
            for palabra_simplificada in lista_simplificada:
                if nlp(palabra).similarity(nlp(palabra_simplificada)) > 0.7:
                    grupo_simplificado.append(palabra_simplificada)
                    break
            else:
                continue
            break
        else:
            grupo_simplificado.append(palabra)
    grupos_simplificados[num_grupo] = list(set(grupo_simplificado))

for i in range(len(lista_palabras)):
    # Reemplazar las comas por espacios en cada elemento
    lista_palabras[i] =lista_palabras[i].replace(',', ' ')
    
# Imprimir los grupos simplificados
for num_grupo, lista_palabras in grupos_simplificados.items():
    print(lista_palabras)
    

['suspense', 'latino', 'anime features', 'series', 'anthology', 'british  ', 'cult  ', 'culture', 'wellness', 'soap opera', 'adventure', 'arts', 'music', 'musical', 'western', 'biographical', 'docuseries', 'special interest', 'animals', 'action', 'dramas', 'action adventure', 'drama', 'mystery', 'lifestyle', 'cooking', 'game shows', 'sports  ', 'stand up', 'talk show', 'musicals', 'fantasy', 'concert film', 'history', "kids'  ", 'superhero', 'horror  ', 'horror', 'thriller', 'melodrama', 'romantic comedy', 'survival', 'competition', 'coming of age', 'documentary', 'variety', 'young adult audience', 'adult animation', 'news', 'spy espionage', 'technology', 'lgbtq', 'classics', 'talk show and variety', 'science', 'parody', 'lgbtq+', 'family  ', 'dance', 'travel', 'stand up comedy', 'faith and spirituality', 'children', 'and culture', 'anime', 'game show', 'documentaries', 'historical', 'international', 'animation', 'family', 'thrillers', 'buddy', 'lgbtq  ', 'reality', 'medical', 'reality

In [221]:
import spacy
import pandas as pd

# Cargar el modelo de lenguaje de spaCy
nlp = spacy.load("en_core_web_sm")

# Función para modificar las listas de la columna
def modificar_listas(lista_palabras, df_non_null):
    for index, row in df_non_null.iterrows():
        lista_columna = row['gender_type1']
        for i, palabra_columna in enumerate(lista_columna):
            doc_columna = nlp(palabra_columna)
            for palabra_palabras in lista_palabras:
                doc_palabras = nlp(palabra_palabras)
                for token_columna in doc_columna:
                    for token_palabras in doc_palabras:
                        if token_columna.similarity(token_palabras) > 0.7:
                            lista_columna[i] = palabra_palabras
                            break
                    else:
                        continue
                    break
        df_non_null.at[index, 'gender_type1'] = lista_columna
    return df_non_null


# Modificar las listas de la columna
df_modificado = modificar_listas(lista_palabras, df_non_null)

# Imprimir el DataFrame modificado
print(df_modificado)




  if token_columna.similarity(token_palabras) > 0.7:


       index movie_or_serie                          title  \
0          1          Movie                   silent night   
1          2          Movie                   the marksman   
2          3          Movie                           gaia   
3          4          Movie                       settlers   
4          5        TV Show  the halloween candy magic pet   
...      ...            ...                            ...   
12956  22993          Movie                         zodiac   
12957  22994        TV Show                    zombie dumb   
12958  22995          Movie                     zombieland   
12959  22996          Movie                           zoom   
12960  22997          Movie                         zubaan   

                  director                                               cast  \
0      uninformed director                                    uninformed cast   
1      uninformed director                                    uninformed cast   
2      uninf

In [222]:

df_modificado.head()


Unnamed: 0,index,movie_or_serie,title,director,cast,country,date_added_platform,release_year,duration_seconds,gender_type,description,channel_streaming,gender_type1
0,1,Movie,silent night,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2020,94 min,"crime, drama, thriller","Mark, a low end South London hitman recently r...",hulu-movies-and-tv-shows,"[food, food, food]"
1,2,Movie,the marksman,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2021,108 min,"action, thriller",A hardened Arizona rancher tries to protect an...,hulu-movies-and-tv-shows,"[food, food]"
2,3,Movie,gaia,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,97 min,horror,A forest ranger and two survivalists with a cu...,hulu-movies-and-tv-shows,[health]
3,4,Movie,settlers,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,104 min,"science fiction, thriller",Mankind's earliest settlers on the Martian fro...,hulu-movies-and-tv-shows,"[science fiction, food]"
4,5,TV Show,the halloween candy magic pet,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,1 Season,"family, kids",Join Mila and Morphle on a mystery-filled Hall...,hulu-movies-and-tv-shows,"[food, kids]"


In [225]:
# Eliminar duplicados en las listas de la columna 'lista'
df_modificado['gender_type1'] = df_modificado['gender_type1'].apply(lambda x: list(dict.fromkeys(x)))

# Obtener todos los nombres únicos
unique_names = set([name for sublist in df_modificado['gender_type1'] for name in sublist])

# Crear columnas con 0 como valor inicial
for name in unique_names:
    df_modificado[name] = 0

# Actualizar las columnas con 1 si el nombre está en la lista
for index, row in df_modificado.iterrows():
    for name in row['gender_type1']:
        df_modificado.at[index, name] = 1

# Eliminar la columna 'lista' si ya no la necesitas
df_modificado.drop(columns=['gender_type1'], inplace=True)
df_modificado.head()

Unnamed: 0,index,movie_or_serie,title,director,cast,country,date_added_platform,release_year,duration_seconds,gender_type,...,action adventure,classic,spanish language,spy espionage,crime,food,lifestyle,independent,nature,crime.1
0,1,Movie,silent night,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2020,94 min,"crime, drama, thriller",...,0,0,0,0,0,1,0,0,0,0
1,2,Movie,the marksman,uninformed director,uninformed cast,uninformed country,"October 23, 2021",2021,108 min,"action, thriller",...,0,0,0,0,0,1,0,0,0,0
2,3,Movie,gaia,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,97 min,horror,...,0,0,0,0,0,0,0,0,0,0
3,4,Movie,settlers,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,104 min,"science fiction, thriller",...,0,0,0,0,0,1,0,0,0,0
4,5,TV Show,the halloween candy magic pet,uninformed director,uninformed cast,uninformed country,"October 22, 2021",2021,1 Season,"family, kids",...,0,0,0,0,0,1,0,0,0,0


In [197]:
from nltk.stem import PorterStemmer

# Inicializar el stemmer
stemmer = PorterStemmer()

# Función para aplicar stemming a un nombre
def stem_name(name):
    return stemmer.stem(name)

# Supongamos que 'new_columns' es una lista de nombres
#new_columns = ["Nombre1", "Nombre2", "Nombre3"]

# Aplicar stemming a cada nombre
nombres_stemmed = [stem_name(name) for name in new_columns]

# Crear un diccionario para almacenar los grupos de nombres según su raíz
grupos_raices = {}
for idx, nombre_stemmed in enumerate(nombres_stemmed):
    if nombre_stemmed not in grupos_raices:
        grupos_raices[nombre_stemmed] = []
    grupos_raices[nombre_stemmed].append(new_columns[idx])

# Mostrar los grupos resultantes
for raiz, names in grupos_raices.items():
    print(f"Raíz: {raiz}, Nombres: {names}")



Raíz: action, Nombres: ['action', 'action']
Raíz: action,adventur, Nombres: ['action,adventure']
Raíz: adult anim, Nombres: ['adult animation']
Raíz: adventur, Nombres: ['adventure']
Raíz: and cultur, Nombres: ['and culture']
Raíz: anim, Nombres: ['animals', 'animation', 'anime']
Raíz: anime featur, Nombres: ['anime features']
Raíz: anime seri, Nombres: ['anime series']
Raíz: antholog, Nombres: ['anthology']
Raíz: art, Nombres: ['arts']
Raíz: biograph, Nombres: ['biographical']
Raíz: black stori, Nombres: ['black stories']
Raíz: british ,, Nombres: ['british ,']
Raíz: buddi, Nombres: ['buddy']
Raíz: cartoon, Nombres: ['cartoons']
Raíz: children, Nombres: ['children']
Raíz: classic, Nombres: ['classic', 'classics']
Raíz: classic ,, Nombres: ['classic ,']
Raíz: comedi, Nombres: ['comedies', 'comedy']
Raíz: coming of ag, Nombres: ['coming of age']
Raíz: competit, Nombres: ['competition']
Raíz: concert film, Nombres: ['concert film']
Raíz: cook, Nombres: ['cooking']
Raíz: crime, Nombres: [

In [201]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np


# Tokenizar los nombres en listas de caracteres
tokenized_names = [list(name.lower()) for name in new_columns]

# Entrenar un modelo Word2Vec
model = Word2Vec(sentences=tokenized_names, vector_size=10, window=5, min_count=1, workers=4)

# Obtener vectores para cada nombre
name_vectors = [np.mean([model.wv[char] for char in name], axis=0) for name in tokenized_names]


# Usar K-Means para agrupar los vectores
num_clusters = 14  # ajusta según tus necesidades
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(name_vectors)

# Crear un diccionario para almacenar los grupos de nombres
grupos = {}
for idx, cluster_id in enumerate(clusters):
    if cluster_id not in grupos:
        grupos[cluster_id] = []
    grupos[cluster_id].append(new_columns[idx])

# Mostrar los grupos resultantes
for cluster_id, names in grupos.items():
    print(f"Grupo {cluster_id + 1}: {names}")


Grupo 4: ['action', 'action', 'adult animation', 'animation', 'coming of age', 'competition', 'cooking', 'international', 'international ,', 'latino']
Grupo 8: ['action,adventure', 'concert film', 'dance', 'late night', 'nature ,', 'police,cop', 'reality ,', 'romantic ,', 'science fiction', 'young adult audience']
Grupo 10: ['adventure', 'documentaries', 'documentary', 'faith', 'fantasy', 'nature', 'reality', 'romance', 'romantic comedy', 'sketch comedy', 'travel', 'variety']
Grupo 2: ['and culture', 'british ,', 'children', 'culture', 'lifestyle', 'lifestyle', 'spanish,language ,', 'sports', 'sports ,', 'spy,espionage', 'stand up', 'stand,up comedy', 'stand,up comedy', 'thrillers', 'unscripted']
Grupo 1: ['animals', 'biographical', 'comedy', 'drama', 'family', 'family ,', 'health', 'medical', 'melodrama', 'parody']
Grupo 5: ['anime', 'anime features', 'anime series', 'comedies', 'crime', 'crime ,', 'game show', 'game shows', 'series', 'soap opera']
Grupo 9: ['anthology', 'cartoons', '

In [207]:
import spacy

# Cargar el modelo de lenguaje de spaCy
nlp = spacy.load("en_core_web_sm")

# Definir las listas originales de los grupos
grupos = {
    4: ['action', 'adult animation', 'animation', 'coming of age', 'competition', 'cooking', 'international', 'latino'],
    8: ['action,adventure', 'concert film', 'dance', 'late night', 'nature', 'police,cop', 'reality', 'romantic', 'science fiction', 'young adult audience'],
    10: ['adventure', 'documentaries', 'faith', 'fantasy', 'nature', 'reality', 'romance', 'romantic comedy', 'sketch comedy', 'travel', 'variety'],
    2: ['culture', 'lifestyle', 'spanish', 'sports', 'spy,espionage', 'stand up', 'thrillers', 'unscripted'],
    1: ['animals', 'biographical', 'comedy', 'drama', 'family', 'health', 'medical', 'melodrama', 'parody'],
    5: ['anime', 'comedies', 'crime', 'game show', 'series', 'soap opera'],
    9: ['anthology', 'cartoons', 'technology'],
    3: ['arts', 'buddy', 'docuseries', 'dramas', 'mysteries', 'mystery', 'superhero'],
    12: ['black stories', 'disaster', 'faith and spirituality', 'historical', 'korean', 'sitcom', 'spirituality', 'talk show', 'talk show and variety', 'talk shows', 'thriller'],
    14: ['classic', "kids'", 'sci,fi', 'suspense', 'wellness'],
    13: ['cult', 'lgbtq', 'lgbtq+'],
    7: ['entertainment', 'independent', 'news', 'science', 'special interest', 'teen', 'western'],
    11: ['food', 'history', 'horror'],
    6: ['kids', 'music', 'musical', 'musicals', 'survival']
}

# Agrupar las palabras basadas en la similitud semántica
grupos_simplificados = {}

for num_grupo, lista_palabras in grupos.items():
    grupo_simplificado = []
    for palabra in lista_palabras:
        for _, lista_simplificada in grupos_simplificados.items():
            for palabra_simplificada in lista_simplificada:
                if nlp(palabra).similarity(nlp(palabra_simplificada)) > 0.7:
                    grupo_simplificado.append(palabra_simplificada)
                    break
            else:
                continue
            break
        else:
            grupo_simplificado.append(palabra)
    grupos_simplificados[num_grupo] = list(set(grupo_simplificado))

# Imprimir los grupos simplificados
for num_grupo, lista_palabras in grupos_simplificados.items():
    print(f"Grupo {num_grupo}: {lista_palabras}")


  if nlp(palabra).similarity(nlp(palabra_simplificada)) > 0.7:


Grupo 4: ['international', 'animation', 'latino', 'competition', 'adult animation', 'action', 'coming of age', 'cooking']
Grupo 8: ['romantic', 'competition', 'adult animation', 'late night', 'dance', 'action,adventure', 'police,cop']
Grupo 10: ['fantasy', 'romantic comedy', 'animation', 'competition', 'documentaries', 'sketch comedy']
Grupo 2: ['sports', 'international', 'animation', 'stand up', 'documentaries', 'unscripted', 'lifestyle', 'action,adventure']
Grupo 1: ['international', 'animation', 'latino', 'competition', 'animals']
Grupo 5: ['animation', 'adult animation', 'documentaries', 'anime', 'lifestyle']
Grupo 9: ['sports', 'animation']
Grupo 3: ['lifestyle', 'animation', 'documentaries']
Grupo 12: ['korean', 'international', 'animation', 'faith and spirituality', 'talk shows', 'talk show', 'thriller', 'lifestyle', 'black stories', 'talk show and variety']
Grupo 14: ['latino', 'classic', 'sci,fi', 'lifestyle', "kids'"]
Grupo 13: ['lgbtq', 'lgbtq+', 'cult']
Grupo 7: ['romantic 

In [206]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting spacy
  Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp311-cp311-win_amd64.

In [30]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Leer el archivo CSV con la columna
def leer_columna(nombre_archivo, gender_type):
    df = pd.read_csv('all_streaming.csv')
    return df[gender_type]

# Modificar el texto según las especificaciones dadas
def modificar_texto(texto):
    # Reemplazar "and" por ", "
    texto = texto.replace('and', ', ')
    # Añadir espacio después de cada coma
    texto = texto.replace(',', ', ')
    return texto

# Eliminar espacios antes de las comas y eliminar las comas
def eliminar_comas(texto):
    texto = texto.replace(' ,', ',')
    texto = texto.replace(',', '')
    return texto

# Calcular la similitud semántica entre dos palabras
def similitud_semantica(palabra1, palabra2):
    synsets1 = wordnet.synsets(palabra1)
    synsets2 = wordnet.synsets(palabra2)
    if synsets1 and synsets2:
        return max(synsets1[0].path_similarity(synsets2[0]), 0)
    else:
        return 0

# Filtrar palabras con similitud semántica mayor al 70%
def filtrar_palabras_similares(palabras):
    # Convertir palabras en minúsculas y eliminar stopwords
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    palabras_procesadas = [lemmatizer.lemmatize(word.lower()) for word in palabras if word.lower() not in stop_words]
    
    # Calcular la matriz TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(palabras_procesadas)
    
    # Calcular la similitud de coseno entre todas las palabras
    similitud_cos = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Crear lista de palabras similares
    palabras_similares = []
    for i in range(len(palabras_procesadas)):
        for j in range(i+1, len(palabras_procesadas)):
            if similitud_cos[i][j] > 0.7:
                palabras_similares.append((palabras[i], palabras[j]))
    
    return palabras_similares

# Leer la columna
columna = leer_columna("all_streaming.csv", "gender_type")

# Modificar el texto según las especificaciones dadas
columna_modificada = columna.apply(modificar_texto)

# Eliminar espacios antes de las comas y eliminar las comas
columna_procesada = columna_modificada.apply(eliminar_comas)

# Crear lista de palabras con similitud semántica mayor al 70%
palabras_similares = filtrar_palabras_similares(columna_procesada)


In [31]:
palabras_similares

[('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'comedy  news  st   up'),
 ('comedy  st   up', 'comedy  documentaries  st   up'),
 ('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'comedy  news  st   up'),
 ('comedy  st   up', 'comedy  news  st   up'),
 ('comedy  st   up', 'comedy  latino  st   up'),
 ('comedy  st   up', 'comedy  st   up'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 ('comedy  st   up', 'st  up comedy'),
 

In [1]:
import pandas as pd

# Definir función para leer el archivo CSV y filtrar palabras similares
def leer_columna(nombre_archivo):
    df = pd.read_csv(nombre_archivo)
    return df['gender_type']

def filtrar_palabras_similares(palabras):
    # Tu implementación actual para filtrar palabras con similitud semántica mayor al 70%
    pass

# Leer la columna 'gender_type' del archivo CSV
columna = leer_columna("all_streaming.csv")

# Obtener las parejas de palabras con similitud semántica mayor al 70%
palabras_similares = filtrar_palabras_similares(columna)

# Crear una nueva columna en el DataFrame
df = pd.read_csv("all_streaming.csv")
df['nueva_columna'] = ''

# Iterar sobre las parejas de palabras similares
for palabra1, palabra2 in palabras_similares:
    # Si la segunda palabra de la pareja está en 'gender_type', escribir la primera palabra en 'nueva_columna'
    df.loc[df['gender_type'] == palabra2, 'nueva_columna'] = palabra1

# Guardar el DataFrame modificado en un nuevo archivo CSV
df.to_csv("all_streaming_con_nueva_columna.csv", index=False)

TypeError: 'NoneType' object is not iterable