# Les bases de pandas 

In [109]:
import pandas as pd

In [110]:
# Load the films dataset from the local CSV file, decoded as UTF-8.
df = pd.read_csv(
    'films.csv',
    encoding='utf-8',
)

In [111]:
# Preview the first 3 rows of the dataset.
df.head(n=3)

Unnamed: 0,title,original_title,score,type,country,public,date,time,description,actors
0,Les Évadés,The Shawshank Redemption,9.3,Drama,United States,Tous publics,1994,2h 22m,Two imprisoned men bond over a number of years...,"Tim Robbins,Morgan Freeman,Bob Gunton,William ..."
1,La Vie des autres,Das Leben der Anderen,8.4,Drama,"Germany,France",Tous publics,2006,2h 17m,"In 1984 East Berlin, an agent of the secret po...","Ulrich Mühe,Martina Gedeck,Sebastian Koch,Ulri..."
2,Le Dictateur,The Great Dictator,8.4,Comedy,United States,Tous publics,1940,2h 5m,Dictator Adenoid Hynkel tries to expand his em...,"Charles Chaplin,Paulette Goddard,Jack Oakie,Re..."


In [112]:
# Preview the last 9 rows of the dataset.
df.tail(n=9)

Unnamed: 0,title,original_title,score,type,country,public,date,time,description,actors
242,Le Seigneur des anneaux : La Communauté de l'a...,The Lord of the Rings: The Fellowship of the ...,8.8,Action,"New Zealand,United States",Tous publics,2001,2h 58m,A meek Hobbit from the Shire and eight compani...,"Elijah Wood,Ian McKellen,Orlando Bloom,Sean Be..."
243,Forrest Gump,Forrest Gump,8.8,Drama,United States,Tous publics,1994,2h 22m,"The presidencies of Kennedy and Johnson, the V...","Tom Hanks,Robin Wright,Gary Sinise,Sally Field..."
244,Pulp Fiction,Pulp Fiction,8.9,Crime,United States,12,1994,2h 34m,"The lives of two mob hitmen, a boxer, a gangst...","John Travolta,Uma Thurman,Samuel L. Jackson,Br..."
245,La liste de Schindler,Schindler's List,8.9,Biography,United States,Tous publics,1993,3h 15m,"In German-occupied Poland during World War II,...","Liam Neeson,Ralph Fiennes,Ben Kingsley,Carolin..."
246,12 Hommes en colère,12 Angry Men,9.0,Crime,United States,Tous publics,1957,1h 36m,The jury in a New York City murder trial is fr...,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie..."
247,"Le Parrain, 2ᵉ partie",The Godfather: Part II,9.0,Crime,United States,13,1974,3h 22m,The early life and career of Vito Corleone in ...,"Al Pacino,Robert De Niro,Robert Duvall,Diane K..."
248,The Dark Knight : Le Chevalier noir,The Dark Knight,9.0,Action,"United States,United Kingdom",Tous publics,2008,2h 32m,When the menace known as the Joker wreaks havo...,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich..."
249,Le Parrain,The Godfather,9.2,Crime,United States,12,1972,2h 55m,The Godfather follows Vito Corleone Don of the...,"Marlon Brando,Al Pacino,James Caan,Diane Keato..."
250,Le Seigneur des anneaux : Le Retour du roi,The Lord of the Rings: The Return of the King,8.9,Action,"New Zealand,United States",Tous publics,2003,3h 21m,Gandalf and Aragorn lead the World of Men agai...,"Elijah Wood,Viggo Mortensen,Ian McKellen,Orlan..."


In [113]:
# Inspect missing values and data types: prints, for every column, its
# non-null count and dtype, followed by the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           251 non-null    object 
 1   original_title  251 non-null    object 
 2   score           251 non-null    float64
 3   type            251 non-null    object 
 4   country         251 non-null    object 
 5   public          251 non-null    object 
 6   date            251 non-null    int64  
 7   time            251 non-null    object 
 8   description     251 non-null    object 
 9   actors          251 non-null    object 
dtypes: float64(1), int64(1), object(8)
memory usage: 19.7+ KB


In [114]:
# Explore the numeric columns: count, mean, std, min, quartiles and max.
# Only numeric columns are summarised (here `score` and `date`).
df.describe()

Unnamed: 0,score,date
count,251.0,251.0
mean,8.299203,1986.446215
std,0.255342,25.048435
min,6.5,1921.0
25%,8.1,1966.5
50%,8.2,1994.0
75%,8.4,2006.5
max,9.3,2021.0


In [115]:
# String replacement: strip commas and spaces from the raw duration text
# (e.g. "2h 22m" -> "2h22m"). `regex=False` spells out that both patterns are
# plain literals, independent of the pandas version's default.
time_temp = (
    df['time']
    .str.replace(",", "", regex=False)
    .str.replace(" ", "", regex=False)
)

In [116]:
# Regex extraction: pull the hour and minute components out of the compact
# duration text (e.g. "2h22m"). `expand=False` returns a Series rather than a
# one-column DataFrame, and the minutes are anchored on their own "m" suffix
# so that durations without an hour part (e.g. "45m") are still captured.
heures = time_temp.str.extract(r'(\d+)h', expand=False)
minutes = time_temp.str.extract(r'(\d+)m', expand=False)

# Type conversion: the captures are digit strings or NaN, so the cast to float
# cannot fail. `errors='ignore'` is therefore dropped: it would not "work
# around" NaN, it would silently hand back the unconverted data on failure.
heures = heures.astype('float')
minutes = minutes.astype('float')

# A duration may carry only one component ("2h" or "45m"). Count the absent
# component as 0 so the total stays defined; rows where neither component was
# found remain NaN. The tuple is built before the assignment, so both masks
# are computed from the un-filled values.
heures, minutes = (
    heures.mask(heures.isna() & minutes.notna(), 0.0),
    minutes.mask(minutes.isna() & heures.notna(), 0.0),
)

In [117]:
# Demo: arithmetic on a column is applied to every element at once.
# Operator precedence evaluates the scalar part first, so this is
# date + (1 * 8 / 5) = date + 1.6, and the result is a float column.
print('Avant:')
display(df['date'])
print('\nAprès:')
df['date'] + (1 * 8 / 5)

Avant:


0      1994
1      2006
2      1940
3      2012
4      2000
       ... 
246    1957
247    1974
248    2008
249    1972
250    2003
Name: date, Length: 251, dtype: int64


Après:


0      1995.6
1      2007.6
2      1941.6
3      2013.6
4      2001.6
        ...  
246    1958.6
247    1975.6
248    2009.6
249    1973.6
250    2004.6
Name: date, Length: 251, dtype: float64

In [118]:
# Define a new column: total running time in minutes (60 * hours + minutes).
df['time_cleaned'] = 60 * heures + minutes
display(df.loc[:, 'time_cleaned'])

0      142.0
1      137.0
2      125.0
3      165.0
4      113.0
       ...  
246     96.0
247    202.0
248    152.0
249    175.0
250    201.0
Name: time_cleaned, Length: 251, dtype: float64

In [119]:
# Boolean mask: True for films whose country field contains 'United States'
# (co-productions such as "United States,United Kingdom" match as well).
mask = df['country'].str.contains('United States')
df.loc[mask]

Unnamed: 0,title,original_title,score,type,country,public,date,time,description,actors,time_cleaned
0,Les Évadés,The Shawshank Redemption,9.3,Drama,United States,Tous publics,1994,2h 22m,Two imprisoned men bond over a number of years...,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",142.0
2,Le Dictateur,The Great Dictator,8.4,Comedy,United States,Tous publics,1940,2h 5m,Dictator Adenoid Hynkel tries to expand his em...,"Charles Chaplin,Paulette Goddard,Jack Oakie,Re...",125.0
3,Django Unchained,Django Unchained,8.4,Drama,United States,12,2012,2h 45m,"With the help of a German bounty-hunter, a fre...","Jamie Foxx,Christoph Waltz,Leonardo DiCaprio,K...",165.0
4,Memento,Memento,8.4,Mystery,United States,Tous publics,2000,1h 53m,A man with short-term memory loss attempts to ...,"Guy Pearce,Carrie-Anne Moss,Joe Pantoliano,Mar...",113.0
5,Les Aventuriers de l'arche perdue,Raiders of the Lost Ark,8.4,Action,United States,Tous publics,1981,1h 55m,"In 1936, archaeologist and adventurer Indiana ...","Harrison Ford,Karen Allen,Paul Freeman,John Rh...",115.0
...,...,...,...,...,...,...,...,...,...,...,...
246,12 Hommes en colère,12 Angry Men,9.0,Crime,United States,Tous publics,1957,1h 36m,The jury in a New York City murder trial is fr...,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",96.0
247,"Le Parrain, 2ᵉ partie",The Godfather: Part II,9.0,Crime,United States,13,1974,3h 22m,The early life and career of Vito Corleone in ...,"Al Pacino,Robert De Niro,Robert Duvall,Diane K...",202.0
248,The Dark Knight : Le Chevalier noir,The Dark Knight,9.0,Action,"United States,United Kingdom",Tous publics,2008,2h 32m,When the menace known as the Joker wreaks havo...,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",152.0
249,Le Parrain,The Godfather,9.2,Crime,United States,12,1972,2h 55m,The Godfather follows Vito Corleone Don of the...,"Marlon Brando,Al Pacino,James Caan,Diane Keato...",175.0


In [120]:
# Boolean mask: keep the films with a score strictly greater than 9,
# and store the filtered rows in `df_new` for later cells.
mask = df['score'] > 9
df_new = df.loc[mask]
df_new

Unnamed: 0,title,original_title,score,type,country,public,date,time,description,actors,time_cleaned
0,Les Évadés,The Shawshank Redemption,9.3,Drama,United States,Tous publics,1994,2h 22m,Two imprisoned men bond over a number of years...,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",142.0
15,Soorarai Pottru,Soorarai Pottru,9.1,Drama,India,18,2020,2h 33m,"Nedumaaran Rajangam ""Maara"" sets out to make t...","Suriya,Paresh Rawal,Aparna Balamurali,Gaurav P...",153.0
249,Le Parrain,The Godfather,9.2,Crime,United States,12,1972,2h 55m,The Godfather follows Vito Corleone Don of the...,"Marlon Brando,Al Pacino,James Caan,Diane Keato...",175.0


In [121]:
# Boolean mask written inline: films released in 1994.
df.loc[df['date'] == 1994]

Unnamed: 0,title,original_title,score,type,country,public,date,time,description,actors,time_cleaned
0,Les Évadés,The Shawshank Redemption,9.3,Drama,United States,Tous publics,1994,2h 22m,Two imprisoned men bond over a number of years...,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",142.0
214,Le Roi Lion,The Lion King,8.5,Animation,United States,Tous publics,1994,1h 28m,Lion prince Simba and his father are targeted ...,"Matthew Broderick,Jeremy Irons,James Earl Jone...",88.0
225,Léon,Léon,8.5,Action,"France,United States",12,1994,1h 50m,"Mathilda, a 12-year-old girl, is reluctantly t...","Jean Reno,Gary Oldman,Natalie Portman,Danny Ai...",110.0
243,Forrest Gump,Forrest Gump,8.8,Drama,United States,Tous publics,1994,2h 22m,"The presidencies of Kennedy and Johnson, the V...","Tom Hanks,Robin Wright,Gary Sinise,Sally Field...",142.0
244,Pulp Fiction,Pulp Fiction,8.9,Crime,United States,12,1994,2h 34m,"The lives of two mob hitmen, a boxer, a gangst...","John Travolta,Uma Thurman,Samuel L. Jackson,Br...",154.0


In [122]:
# Select several columns at once by passing a list of column labels
# (despite its name, `new_mask` is a list of labels, not a boolean mask).
new_mask = ['title', 'original_title', 'score']
df_new.loc[:, new_mask]

Unnamed: 0,title,original_title,score
0,Les Évadés,The Shawshank Redemption,9.3
15,Soorarai Pottru,Soorarai Pottru,9.1
249,Le Parrain,The Godfather,9.2


In [123]:
# Mean score per audience rating (`public`).
# NOTE(review): the labels are used as-is, so near-duplicates such as
# 'Tous Public' and 'Tous publics' end up in separate groups.
(
    df
    .groupby('public')['score']
    .mean()
)

public
(Banned)                           8.250000
12                                 8.325806
12 avec avertissement              8.250000
13                                 8.400000
16                                 8.422222
16 avec avertissement              8.300000
18                                 8.525000
7                                  8.200000
Not Rated                          8.133333
PG                                 8.400000
R                                  8.100000
Tous Public                        8.080000
Tous publics                       8.295679
Tous publics avec avertissement    8.315789
Name: score, dtype: float64

In [124]:
# Export the DataFrame to a new .csv file. `index=False` leaves out the
# RangeIndex, which would otherwise be written as an extra unnamed first
# column and come back as "Unnamed: 0" when the file is read again.
df.to_csv('films_cleaned.csv', index=False)