In [314]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

mycols = ['genres', 'actor', 'director', 'writer', 'producer', 'actress']

num = X.select_dtypes(exclude=['object']).columns
cat = X.select_dtypes(include=['object']).columns

num_transformer = Pipeline(steps=[('imputer', SimpleImputer()),                                  ('discretizer', KBinsDiscretizer(encode='ordinal', strategy='uniform')),                                  ('scaler', MinMaxScaler())])

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),                                  ('tfidf', TfidfVectorizer())])

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num),                                               ('cat', cat_transformer, cat)], remainder='passthrough')

pipe = Pipeline(steps=[('preprocessor', preprocessor),                       ('classifier', LogisticRegression(random_state=1, max_iter=10000))])

param_grid = dict(preprocessor__num__imputer__strategy=['mean', 'median'],
                  preprocessor__num__discretizer__n_bins=range(5, 10),
                  classifier__C=[0.1, 10, 100],
                  classifier__solver=['liblinear', 'saga'])

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)


KeyboardInterrupt: 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Read the data
df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='averageRating'), df['averageRating'], test_size=0.2, random_state=42)

# Define the column transformers
num_cols = X_train.select_dtypes(exclude=['object']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('discretizer', KBinsDiscretizer(encode='ordinal', strategy='uniform')),
    ('scaler', MinMaxScaler())])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('tfidf', TfidfVectorizer())])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)])

# Define the classifier
clf = LogisticRegression(max_iter=10000, random_state=1)

# Define the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])

# Define the hyperparameters to tune
params = {'preprocessor__cat__tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
          'classifier__C': [0.1, 1, 10, 100]}

# Define the grid search
grid_search = GridSearchCV(pipeline, param_grid=params, cv=5)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the score
print('Best hyperparameters:', best_params)
print('Best score:', best_score)

# Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Convert the predicted class array to a dataframe
y_pred_df = pd.DataFrame(y_pred, columns=['predicted_class'])

# Concatenate the predicted class dataframe with the test dataframe
result_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True), y_pred_df], axis=1)

# Print the result dataframe
print(result_df)


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_cols = X.select_dtypes(exclude=['object']).columns
cat_cols = X.select_dtypes(include=['object']).columns
text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']

num_transformer = Pipeline(steps=[('imputer', SimpleImputer()),
                                  ('discretizer', KBinsDiscretizer(encode='ordinal', strategy='uniform')),
                                  ('scaler', MinMaxScaler())])

cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])

text_transformer = Pipeline(steps=[('tfidf', TfidfVectorizer(lowercase=True, stop_words='english'))])

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_cols),
                                               ('cat', cat_transformer, cat_cols),
                                               ('text', text_transformer, text_cols)])

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression(random_state=1, max_iter=10000))])

param_grid = dict(preprocessor__num__imputer__strategy=['mean', 'median'],
                  preprocessor__num__discretizer__n_bins=range(5, 10),
                  classifier__C=[0.1, 10, 100],
                  classifier__solver=['liblinear', 'saga'])

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# get best score and params
print(f"Best score: {grid_search.best_score_:.3f}")
print("Best params:", grid_search.best_params_)

# predict on test set
y_pred = grid_search.predict(X_test)
print("Accuracy on test set:", (y_test == y_pred).mean())


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#num_cols = X.select_dtypes(exclude=['object']).columns
#cat_cols = X.select_dtypes(include=['object']).columns
text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']

#num_transformer = Pipeline(steps=[('imputer', SimpleImputer()),
#                                  ('discretizer', KBinsDiscretizer(encode='ordinal', strategy='uniform')),
#                                  ('scaler', MinMaxScaler())])

#cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])

text_transformer = Pipeline(steps=[('tfidf', TfidfVectorizer(lowercase=True, stop_words='english'))])

preprocessor = ColumnTransformer(transformers=[('text', text_transformer, text_cols)])
                                               

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression(random_state=1, max_iter=10000))])

param_grid = dict(preprocessor__num__imputer__strategy=['mean', 'median'],
                  preprocessor__num__discretizer__n_bins=range(5, 10),
                  classifier__C=[0.1, 10, 100],
                  classifier__solver=['liblinear', 'saga'])

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

# get best score and params
print(f"Best score: {grid_search.best_score_:.3f}")
print("Best params:", grid_search.best_params_)

# predict on test set
y_pred = grid_search.predict(X_test)
print("Accuracy on test set:", (y_test == y_pred).mean())

In [318]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')

text_transformer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_text = text_transformer.fit_transform(X_train[text_cols].apply(lambda x: ' '.join(x), axis=1))
X_test_text = text_transformer.transform(X_test[text_cols].apply(lambda x: ' '.join(x), axis=1))

clf = LogisticRegression(random_state=1, max_iter=10000)
clf.fit(X_train_text, y_train)

print("Accuracy on test set:", clf.score(X_test_text, y_test))


Accuracy on test set: 0.38433045432202795


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVR 

In [7]:
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR

df = pd.read_csv("data/data.csv").drop(['tconst'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')

text_transformer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_text = text_transformer.fit_transform(X_train[text_cols].apply(lambda x: ' '.join(x), axis=1))
X_test_text = text_transformer.transform(X_test[text_cols].apply(lambda x: ' '.join(x), axis=1))

# Define the parameter grid to search
param_grid = {
    'C': [10],
    'kernel': ['linear'],
    'degree': [2],
}
#0.01, 0.1, 100
#, 3,,, ,, 4 
# sigmoid',  'rbf' , 'poly'
# Create a GridSearchCV object
clf = GridSearchCV(SVR(), param_grid, n_jobs=-1)

# Fit the GridSearchCV object to the training data
clf.fit(X_train_text, y_train)

print(f"Best parameters: {clf.best_params_}")
print(f"Best score: {clf.best_score_}")
print("Accuracy on test set:", clf.score(X_test_text, y_test))

KeyboardInterrupt: 

REGRESSION LOGISTIQUE

In [14]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Chargement des données
df = pd.read_csv("data/regression.csv").drop(['tconst'], axis=1)

# Séparation des données en ensembles d'entraînement et de test
X = df.drop(columns='averageRating')
y = df['averageRating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition du pipeline
text_cols = ['originalTitle', 'sumVotes','genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')

text_transformer = TfidfVectorizer(lowercase=True, stop_words='english')
clf = LogisticRegression(random_state=1, max_iter=10000)

pipeline = Pipeline([
    ('text_transform', text_transformer),
    ('clf', clf)
])

# Définition de la grille de recherche pour la régression logistique
param_grid = {
    'text_transform__max_features': [1000],
    'clf__C': [10],
    'clf__penalty': ['l2']
}
#, 1000, 2000,,,, 10, 100

# Recherche des meilleurs hyperparamètres en utilisant la validation croisée
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train[text_cols].apply(lambda x: ' '.join(x), axis=1), y_train)

# Évaluation du modèle sur l'ensemble de test
print("Accuracy on test set:", grid_search.score(X_test[text_cols].apply(lambda x: ' '.join(x), axis=1), y_test))
joblib.dump(clf, 'data/pop.pkl')

KeyboardInterrupt: 

SVR

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# chargement des données
df = pd.read_csv("data/data.csv").drop(['tconst'], axis=1)

# décomposition des données en X et y
X = df.drop(columns='averageRating')
y = df['averageRating']

# séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# spécification des colonnes de texte pour la vectorisation
text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')
# spécification des transformations à appliquer à chaque colonne
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(lowercase=True, stop_words='english'), text_cols)
    ]
)

# déclaration du modèle
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

# définition de la grille de recherche
param_grid = {
    'regressor__kernel': ['linear', 'rbf'],
    'regressor__C': [0.1, 1, 10],
    'regressor__epsilon': [0.01, 0.1, 1],
}

# exécution de la recherche sur la grille
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# évaluation des performances sur l'ensemble de test
print("R^2 on test set:", grid_search.score(X_test, y_test))


RANDOM FOREST REGRESSOR

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv("data/regressionData.csv").drop(['tconst', 'originalTitle'], axis=1)

X = df.drop(columns='averageRating')
y = df['averageRating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')

text_transformer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_text = text_transformer.fit_transform(X_train[text_cols].apply(lambda x: ' '.join(x), axis=1))
X_test_text = text_transformer.transform(X_test[text_cols].apply(lambda x: ' '.join(x), axis=1))

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

regressor = RandomForestRegressor(random_state=1)
grid_search = GridSearchCV(regressor, param_grid, cv=5)
grid_search.fit(X_train_text, y_train)

print("Best parameters:", grid_search.best_params_)
print("Accuracy on test set:", grid_search.score(X_test_text, y_test))


In [None]:
# Importer les bibliothèques nécessaires
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Charger le jeu de données des films dans un dataframe
df = pd.read_csv("data/clusteringData.csv")

# Sélectionner les variables pertinentes pour le clustering
selected_cols = ['actor', 'director', 'writer', 'actress', 'averageRating', 'numVotes']
X = df[selected_cols]

# Standardiser les variables numériques
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Appliquer l'algorithme de clustering K-means
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X_scaled)

# Ajouter la colonne de cluster assigné à chaque film dans le dataframe
df['cluster'] = kmeans.labels_

# Afficher les statistiques des clusters
df.groupby('cluster').mean()

# Prédire la popularité d'un nouveau film
new_film = pd.DataFrame({'acteur': ['Nom de l\'acteur'], 'directeur': ['Nom du directeur'], 
                         'scenariste': ['Nom du scenariste'], 'actrice': ['Nom de l\'actrice'], 
                         'note_sur_dix': [7.5], 'nombre_de_vote': [5000]})

# Standardiser les variables du nouveau film
new_film_scaled = scaler.transform(new_film)

# Prédire le cluster pour le nouveau film
cluster = kmeans.predict(new_film_scaled)
print("Le nouveau film appartient au cluster", cluster)

In [3]:
df = pd.read_csv("data/clusteringData.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284818 entries, 0 to 284817
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          284818 non-null  object 
 1   runtimeMinutes  257511 non-null  float64
 2   genres          275203 non-null  object 
 3   averageRating   284818 non-null  int64  
 4   numVotes        284818 non-null  int64  
 5   actor           245555 non-null  object 
 6   actress         219321 non-null  object 
 7   director        267224 non-null  object 
 8   producer        161833 non-null  object 
 9   writer          165226 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 21.7+ MB


In [None]:
code enlevé de model.py

def concat_features(row):
    return (str(row['genres']).replace(",", " ") + " " + str(row['director']).replace(" ", "").replace(",", "") + " " + str(row["actor"]).replace(" ", "").replace(",", "") + " " + str(row["actress"]).replace(" ", "").replace(",", "") + " " + str(row["producer"]).replace(" ", "").replace(",", " ") + " " + str(row["writer"]).replace(" ", "").replace(",", " "))


def concat_features(row):
    return (str(row['director']).replace(",", ""))


df['director'] = df.apply(concat_features, axis=1)


def concat_features(row):
    return (str(row["actor"]).replace(",", " "))


df['actor'] = df.apply(concat_features, axis=1)


def concat_features(row):
    return (str(row["actress"]).replace(",", ""))


df['actress'] = df.apply(concat_features, axis=1)


def concat_features(row):
    return (str(row["producer"]).replace(",", ""))


df['producer'] = df.apply(concat_features, axis=1)


def concat_features(row):
    return (str(row["writer"]).replace(",", ""))


df['writer'] = df.apply(concat_features, axis=1)

In [None]:
from scipy.stats import pointbiserialr
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Charger le jeu de données des films dans un dataframe
df = pd.read_csv("data/clusteringData.csv")

df = df.dropna()



# Sélection des colonnes pertinentes
selected_cols = ['actor', 'averageRating']
df = df[selected_cols]

# Convertir la colonne "averageRating" en variable numérique
df['averageRating'] = pd.to_numeric(df['averageRating'], errors='coerce')

def concat_features(row):
    return (str(row['actor']).replace(" ", "").replace(",", ""))
df["actor"] = df.apply(concat_features, axis=1)

# Création d'une colonne avec une liste de catégories pour chaque ligne
df["actor"] = df["actor"].str.split(",")

# Encodage des catégories avec LabelEncoder
le = LabelEncoder()
df["categories_encoded"] = df["actor"].apply(le.fit_transform)

# Affichage des correspondances entre les catégories originales et les valeurs encodées
cat_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(cat_mapping)

# Suppression de la colonne originale avec les listes de catégories
data = df.drop("actor", axis=1)
freq = df['averageRating'].value_counts()
data = np.array([freq.values])

obs =  np.vstack([df['averageRating'], data])


# Test du Chi2
chi2, pval, dof, exp = chi2_contingency(obs)

# Affichage des résultats
print("Statistique du test du Chi2 :", chi2)
print("p-value :", pval)

# Calculer le coefficient de corrélation entre la note et les acteurs
#corr, p_value = pointbiserialr(df['averageRating'], count_matrix_svd_norm.sum(axis=1))
#print("Le coefficient de corrélation est de", corr, "avec une p-value de", p_value)


In [None]:
import pandas as pd
from scipy.stats import f_oneway

# Charger les données dans un DataFrame
df = pd.read_csv("data/clusteringData.csv")

# Convertir la colonne catégorielle en variables binaires à l'aide de l'encodage one-hot
dummies = pd.get_dummies(df['actor'])

# Ajouter les variables binaires au DataFrame
df = pd.concat([df, dummies], axis=1)

# Grouper les données par catégorie et effectuer l'ANOVA sur chaque groupe
for categorie in dummies.columns:
    group = df[df[categorie] == 1]['averageRating']
    stat, p = f_oneway(group)
    print(f"Catégorie : {categorie}, Statistique F : {stat}, valeur de p : {p}")

In [7]:
!pip install statsmodels

Defaulting to user installation because normal site-packages is not writeable


In [36]:
import pandas as pd

df = pd.read_csv("data/data.csv")

df= df.dropna()
df.shape



df

Unnamed: 0,tconst,runtimeMinutes,genres,averageRating,numVotes,actor,actress,director,producer,writer
0,tt0001790,60.0,Drama,6,50,Henry Krauss,Maria Ventura,Albert Capellani,Pierre Decourcelle,Paul Capellani
1,tt0001911,50.0,Biography,4,24,Augustus Neville,Nellie Stewart,Raymond Longford,George Musgrove,Mrs. Charles A. Doremus
2,tt0002423,85.0,Biography,7,921,Emil Jannings,Pola Negri,Ernst Lubitsch,Paul Davidson,Norbert Falk
3,tt0002452,120.0,History,6,249,Aristide Demetriade,Constanta Demetriade,Grigore Brezeanu,Leon Popescu,Petre Liciu
4,tt0002605,300.0,Adventure,6,45,Tom Santschi,Kathlyn Williams,Francis J. Grandon,William Nicholas Selig,Harold McGrath
...,...,...,...,...,...,...,...,...,...,...
75181,tt9913084,75.0,Documentary,7,48,Luciano Scarpa,Claudia Stecher,Giancarlo Soldi,Maite Bulgari Carpio,Mario Gomboli
75182,tt9914192,98.0,Comedy,5,279,Maurício Manfrini,Cacau Protásio,Roberto Santucci,André Carreira,Paulo Cursino
75183,tt9916270,84.0,Thriller,6,1392,Sergio Castellitto,Anna Foglietta,Giacomo Cimini,Isabella Cocuzza,Lorenzo Collalti
75184,tt9916362,92.0,Drama,6,5175,Alex Brendemühl,Amaia Aberasturi,Pablo Agüero,Iker Ganuza,Katell Guillou


In [42]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['actor'])
le.classes_
actor = le.transform(df['actor'])
actor

array([11700,  2792,  8189, ..., 27363,  1042, 26586])

In [43]:
cols_to_transform = ['actor', 'director', 'genres', 'writer', 'producer', 'actress']
le = preprocessing.LabelEncoder()

for col in cols_to_transform:
    le.fit(df[col])
    df[col] = df[col].map(lambda x: le.transform([x])[0])
    
df

In [37]:


def anovaGlobal(colonneQualitative, dataframe):
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm
    
    
    resultatAnova = []
    dfGroupby = dataframe.groupby([colonneQualitative]).mean()
    
    listeColonnesQuantitatives = dfGroupby.columns
    dfGroupby.reset_index(inplace=True)
    
    for colonne in listeColonnesQuantitatives:
        # ols(colonneQuantitative ~ colonneQualitative)

        model = ols(f'{colonne} ~ {colonneQualitative}', data=dfGroupby)
        entrainement = model.fit()
        
        st_p = anova_lm(entrainement).loc[colonneQualitative, "PR(>F)"]
        if st_p > 0.05:
            resultatAnova.append([colonneQualitative, colonne, "KO", st_p])
            #print(f"Il n'y pas pas de lien entre {colonneQualitative} et {colonne}")
        elif st_p < 0.05:
            resultatAnova.append([colonneQualitative, colonne, "OK", st_p])
            #print(f"Il y a un lien entre {colonneQualitative} et {colonne}")
        #print(f"La p-value vaut {st_p}")
    return pd.DataFrame(resultatAnova, columns=[colonneQualitative, "averageRating", "Résultat Anova", "P-value"])

In [40]:
anovaGlobal(actor, df)

  dfGroupby = dataframe.groupby([colonneQualitative]).mean()


SyntaxError: invalid syntax. Perhaps you forgot a comma? (<unknown>, line 1)

In [6]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt




df = pd.read_csv('data/similarity_0.csv')
df.shape
df.columns
#df.info()
#dfhead=df.head(20)



df1=df[['tconst', 'genres']]
df1
df1.shape #(284818, 2)
#df1.info()
df1.dropna() #[275203 rows x 2 columns]
#df1.info()
#df1.head(20)

df1_0 = df1.groupby('genres').size().sort_values(ascending=False).reset_index().rename(columns = {0: "number of occurence"})
df1_1 = df1_0.rename(columns = {0: "number of recurence"})
df1_1

x_genres = df1_0['genres']
y_genres = df1_0['number of recurence']
xlabel = df1_0['genres'] 

plt.bar(x_genres, y_genres, width = 0.6)
plt.xticks(xlabel, rotation=90)
plt.xlabel('genres')
plt.ylabel('number of occurence')
plt.title('genres le plus représenté')
plt.show()



KeyError: 'number of recurence'

In [None]:
#films notés et non noté
dfnote=df[['averageRating', 'tconst']]
dfnote
df2=dfnote.groupby('averageRating').size().sort_values(ascending=False).reset_index().rename(columns = {0:"number of occurence"})
ar = np.array([[0, 350024]])
dfnonote = pd.DataFrame(ar, index = ['10'], columns = ['averageRating', 'number of occurence'])
dfnonote.info()
pieces = {"x": df2, "y": dfnonote}
result = pd.concat(pieces)
result
result.info()
result = result.astype(('Int64'))
x_note = result['averageRating']
y_note = result['number of occurence']
xlabel = result['averageRating'] 
if x_note==0:
   x_note=='nan'
else:
    x_note==x_note
plt.bar(x_note, y_note, width = 0.6)
plt.xticks(xlabel, rotation=90)
plt.xlabel('ratings')
plt.ylabel('number of occurence')
plt.title('distribution des notes')
plt.show()


In [19]:
import pandas as pd
import json

# Charger les données à partir d'un fichier CSV
data = pd.read_csv('data/regression.csv')
data = data.drop(['runtimeMinutes', 'tconst'], axis=1)

data.head()

# Convertir les colonnes acteur, producteur, actrices, directeurs et genre en une structure de tableau
data['actor'] = data['actor'].apply(lambda x: json.loads(x) if x else [])
data['producer'] = data['producer'].apply(lambda x: json.loads(x) if x else [])
data['actress'] = data['actress'].apply(lambda x: json.loads(x) if x else [])
data['director'] = data['director'].apply(lambda x: json.loads(x) if x else [])
data['genres'] = data['genres'].apply(lambda x: json.loads(x) if x else [])
data['writer'] = data['writer'].apply(lambda x: json.loads(x) if x else [])

# Concaténer les listes de toutes les colonnes en une seule liste
actors = sum(data['actor'], [])
producers = sum(data['producer'], [])
actresses = sum(data['actress'], [])
directors = sum(data['director'], [])
genres = sum(data['genres'], [])
writers = sum(data['writer'], [])
# Créer des listes uniques pour chaque catégorie
unique_actors = list(set(actors))
unique_producers = list(set(producers))
unique_actresses = list(set(actresses))
unique_directors = list(set(directors))
unique_genres = list(set(genres))
unique_writers = list(set(writers))


print('Nombre d\'acteurs uniques :', len(unique_actors))
print('Nombre de producteurs uniques :', len(unique_producers))
print('Nombre d\'actrices uniques :', len(unique_actresses))
print('Nombre de directeurs uniques :', len(unique_directors))
print('Nombre de genres uniques :', len(unique_genres))
print('Nombre de writers uniques :', len(unique_writers))

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Normaliser les données
X = data.drop(['note'], axis=1)
X = (X - X.mean()) / X.std()

# Sélectionner la colonne cible (note)
y = data['note']

# Créer un objet KMeans
kmeans = KMeans()

# Créer une grille de paramètres pour la recherche de grille
param_grid = {'n_clusters': [2, 3, 4, 5, 6],
              'max_iter': [100, 200, 300, 400, 500]}

# Créer un objet GridSearchCV
grid_search = GridSearchCV(kmeans, param_grid, cv=5)

# Exécuter la recherche de grille sur les données
grid_search.fit(X, y)

# Afficher les meilleurs hyperparamètres et la meilleure score
print('Meilleurs hyperparamètres :', grid_search.best_params_)
print('Meilleure score :', grid_search.best_score_)

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

data.fillna({'actor': [], 'actress': [], 'producer': [], 'director': [], 'genres': [], 'writer': []}, inplace=True)

data['actor'] = data['actor'].astype(str)
data['actress'] = data['actress'].astype(str)
data['producer'] = data['producer'].astype(str)
data['director'] = data['director'].astype(str)
data['genres'] = data['genres'].astype(str)
data['writer'] = data['writer'].astype(str)

# Créer une instance de MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Binariser les colonnes 'actor', 'actress', 'producer', 'director' et 'genre'
actors = mlb.fit_transform(data['actor'])
actresses = mlb.transform(data['actress'])
producers = mlb.transform(data['producer'])
directors = mlb.transform(data['director'])
genres = mlb.fit_transform(data['genres'])
writers = mlb.fit_transform(data['writer'])

# Concaténer les matrices binaires avec les autres colonnes de votre dataframe
data = pd.concat([data.drop(['actor', 'actress', 'producer', 'director', 'genres'], axis=1), 
                  pd.DataFrame(actors, columns=mlb.classes_).add_prefix('actor_'), 
                  pd.DataFrame(actresses, columns=mlb.classes_).add_prefix('actress_'),
                  pd.DataFrame(producers, columns=mlb.classes_).add_prefix('producer_'),
                  pd.DataFrame(directors, columns=mlb.classes_).add_prefix('director_'),
                  pd.DataFrame(genres, columns=mlb.classes_).add_prefix('genre_'),
                  pd.DataFrame(writers, columns=mlb.classes_).add_prefix('writer_')], axis=1)

data

TypeError: "value" parameter must be a scalar or dict, but you passed a "list"

In [30]:
from sklearn.preprocessing import MultiLabelBinarizer

fill_values = {'actor': '', 'actress': '', 'producer': '', 'director': '', 'genres': '', 'writer': ''}
data.fillna(value=fill_values, inplace=True)
data['actor'] = data['actor'].astype(str)
data['actress'] = data['actress'].astype(str)
data['producer'] = data['producer'].astype(str)
data['director'] = data['director'].astype(str)
data['genres'] = data['genres'].astype(str)
data['writer'] = data['writer'].astype(str)

# Binariser les colonnes 'actor', 'actress', 'producer', 'director' et 'genre'
mlb = MultiLabelBinarizer()
actors = mlb.fit_transform(data['actor'])
actresses = mlb.transform(data['actress'])
producers = mlb.transform(data['producer'])
directors = mlb.transform(data['director'])
genres = mlb.fit_transform(data['genres'])
writers = mlb.fit_transform(data['writer'])

# Concaténer les colonnes binarisées
data_transformed = np.hstack((actors, actresses, producers, directors, genres, writers))

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Concaténer les colonnes en une seule chaîne de caractères
data['text'] = data['originalTitle'] + ' ' + data['genres'] + ' ' + data['numVotes'] + ' ' + data['actor'] + ' ' + data['actress'] + ' ' + data['director'] + ' ' + data['producer'] + ' ' + data['writer'].astype(str)

# Créer la matrice de caractéristiques en utilisant TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])

# Effectuer le clustering K-Means avec 5 clusters
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)

# Assigner les clusters à chaque ligne
data['cluster'] = kmeans.labels_




TypeError: can only concatenate str (not "int") to str

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Charger les données à partir d'un fichier CSV
data = pd.read_csv('data/regression.csv')
data = data.drop(['runtimeMinutes', 'tconst'], axis=1)
data['averageRating'] = data['averageRating'].astype(float)

# Remplir les valeurs manquantes et binariser les colonnes pertinentes
fill_values = {'actor': '', 'actress': '', 'producer': '', 'director': '', 'genres': '', 'writer': ''}
data.fillna(value=fill_values, inplace=True)
data['actor'] = data['actor'].astype(str)
data['actress'] = data['actress'].astype(str)
data['producer'] = data['producer'].astype(str)
data['director'] = data['director'].astype(str)
data['genres'] = data['genres'].astype(str)
data['writer'] = data['writer'].astype(str)

# Binariser les colonnes 'actor', 'actress', 'producer', 'director' et 'genre'
mlb = MultiLabelBinarizer()
actors = mlb.fit_transform(data['actor'])
actresses = mlb.transform(data['actress'])
producers = mlb.transform(data['producer'])
directors = mlb.transform(data['director'])
genres = mlb.fit_transform(data['genres'])
writers = mlb.fit_transform(data['writer'])

# Concaténer les colonnes binarisées
data_transformed = np.hstack((actors, actresses, producers, directors, genres, writers))

# Concaténer les colonnes en une seule chaîne de caractères
data['text'] = data['originalTitle'] + ' ' + data['genres'] + ' ' + data['numVotes'].astype(str) + ' ' + data['actor'] + ' ' + data['actress'] + ' ' + data['director'] + ' ' + data['producer'] + ' ' + data['writer'].astype(str)

# Créer la matrice de caractéristiques en utilisant TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['text'])

# Effectuer le clustering K-Means avec 5 clusters
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)

# Assigner les clusters à chaque ligne
data['cluster'] = kmeans.labels_

# Sélectionner les caractéristiques pertinentes pour la prédiction
features = data_transformed

# Ajouter la colonne de cluster aux caractéristiques
features = np.hstack((features, np.array(data['cluster']).reshape(-1, 1)))

# Sélectionner la colonne de variable cible (averageRating)
target = np.array(data['averageRating'])

# Diviser les données en ensembles de formation et de test
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Créer le modèle de régression linéaire et l'entra
# Créer une instance de modèle de régression linéaire
model = LinearRegression()

# Entraîner le modèle sur les données d'entraînement
model.fit(X_train, y_train)

# Faire des prédictions sur les données de test
y_pred = model.predict(X_test)

# Calculer l'erreur quadratique moyenne (MSE) des prédictions
mse = mean_squared_error(y_test, y_pred)

# Afficher le MSE
print('MSE:', mse)




: 

: 

In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Chargement des données
df = pd.read_csv("data/regression.csv").drop(['tconst'], axis=1)

# Séparation des données en ensembles d'entraînement et de test
X = df.drop(columns='averageRating')
y = df['averageRating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition du pipeline
text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
X_train[text_cols] = X_train[text_cols].fillna('')
X_test[text_cols] = X_test[text_cols].fillna('')

text_transformer = TfidfVectorizer(lowercase=False, stop_words='english', max_features=1000)
clf = LogisticRegression(random_state=1, max_iter=10000, C=10, penalty='l2')

pipeline = Pipeline([
    ('text_transform', text_transformer),
    ('clf', clf)
])

# Entraînement du modèle avec les meilleurs hyperparamètres
pipeline.fit(X_train[text_cols].apply(lambda x: ' '.join(x), axis=1), y_train)

# Évaluation du modèle sur l'ensemble de test
print("Accuracy on test set:", pipeline.score(X_test[text_cols].apply(lambda x: ' '.join(x), axis=1), y_test))

# Enregistrement du modèle
joblib.dump(pipeline, 'data/pop2.pkl')

Accuracy on test set: 0.3467102029351871


['data/pop2.pkl']

In [7]:
X_train.values[0]

array(['Amexico', 84.0, 'Drama', 61, 'Roberto Enrique, Richard Gleason',
       'Maricela Ochoa, Olivia Pena', 'Glenn Robert Smith',
       'Shlomo Buchler, Gregg R. Simpson', 'Glenn Robert Smith'],
      dtype=object)

In [8]:
pipeline.predict(X_train.values[0])

AttributeError: 'float' object has no attribute 'lower'

In [4]:
import pandas as pd
from django.shortcuts import render
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Chargement des données
df = pd.read_csv("data/regression.csv").drop(['tconst'], axis=1)

# Définition du pipeline
text_cols = ['genres', 'actor', 'director', 'producer', 'writer', 'actress']
text_transformer = TfidfVectorizer(lowercase=True, stop_words='english', max_features=1000)
list_transformer = FunctionTransformer(lambda x: [' '.join(x)], validate=False)
clf = LogisticRegression(random_state=1, max_iter=10000, C=10, penalty='l2')
df[text_cols] = df[text_cols].astype(str)

pipeline = Pipeline([
    ('list_transform', list_transformer),
    ('text_transform', text_transformer),
    ('clf', clf)
])

# Entraînement du modèle avec les meilleurs hyperparamètres
pipeline.fit(df[text_cols].apply(lambda x: ' '.join(x), axis=1), df['averageRating'])

# Enregistrement du modèle
joblib.dump(pipeline, 'data/pop3.pkl')


ValueError: Found input variables with inconsistent numbers of samples: [1, 284818]