### Movies dataset 

In [1]:
import pandas as pd

In [42]:
users = pd.read_csv('ml-1m/users.dat',
                    sep='::', 
                    engine="python", 
                    header=None, 
                    names=['UserID','Gender','Age','Occupation','Zip-code']
                   )
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [88]:
movies = pd.read_csv('ml-1m/movies.dat', 
                     sep='::', 
                     engine="python", 
                     header=None, 
                     names=['MovieID','Title','Genres']
                    )
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [44]:
ratings = pd.read_csv('ml-1m/ratings.dat', 
                      sep='::', 
                      engine="python", 
                      header=None, 
                      names=['UserID','MovieID','Rating','Timestamp']
                     )
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Get data from string

In [89]:
movies.Title.str.extract(r"\((\d{4})\)$").head()

Unnamed: 0,0
0,1995
1,1995
2,1995
3,1995
4,1995


In [90]:
movies['Year'] = movies.Title.str[-5:-1]
movies['Year_int'] = movies.Title.str[-5:-1]
movies.Year_int = movies.Year_int.astype('int')
movies.Year = pd.to_datetime(movies['Year'])
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year,Year_int
0,1,Toy Story (1995),Animation|Children's|Comedy,1995-01-01,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995-01-01,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995-01-01,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995-01-01,1995
4,5,Father of the Bride Part II (1995),Comedy,1995-01-01,1995


In [91]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 5 columns):
MovieID     3883 non-null int64
Title       3883 non-null object
Genres      3883 non-null object
Year        3883 non-null datetime64[ns]
Year_int    3883 non-null int32
dtypes: datetime64[ns](1), int32(1), int64(1), object(2)
memory usage: 136.6+ KB


In [92]:
movies.Title = movies.Title.str[:-7]

In [93]:
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year,Year_int
0,1,Toy Story,Animation|Children's|Comedy,1995-01-01,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995-01-01,1995
2,3,Grumpier Old Men,Comedy|Romance,1995-01-01,1995
3,4,Waiting to Exhale,Comedy|Drama,1995-01-01,1995
4,5,Father of the Bride Part II,Comedy,1995-01-01,1995


### How many movies in dataframe

In [94]:
len(movies)

3883

In [96]:
movies.shape[0]

3883

### How many movies by year

In [98]:
movies.groupby('Year').count().tail()

Unnamed: 0_level_0,MovieID,Title,Genres,Year_int
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996-01-01,345,345,345,345
1997-01-01,315,315,315,315
1998-01-01,337,337,337,337
1999-01-01,283,283,283,283
2000-01-01,156,156,156,156


In [114]:
movies.groupby('Year_int').size().reset_index().sort_values(by=0,ascending=False).head()

Unnamed: 0,Year_int,0
76,1996,345
75,1995,342
78,1998,337
77,1997,315
79,1999,283


In [118]:
movies.set_index('Year').resample('10A').size()

Year
1919-12-31       3
1929-12-31      34
1939-12-31      77
1949-12-31     126
1959-12-31     168
1969-12-31     191
1979-12-31     247
1989-12-31     598
1999-12-31    2283
2009-12-31     156
Freq: 10A-DEC, dtype: int64

### Gender in users

In [120]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [140]:
users.pivot_table(columns= 'Gender', index='Age', aggfunc='count')[['Occupation']]

Unnamed: 0_level_0,Occupation,Occupation
Gender,F,M
Age,Unnamed: 1_level_2,Unnamed: 2_level_2
1,78,144
18,298,805
25,558,1538
35,338,855
45,189,361
50,146,350
56,102,278


In [151]:
users2 = users.groupby(['Gender','Age']).size().reset_index()
users2

Unnamed: 0,Gender,Age,0
0,F,1,78
1,F,18,298
2,F,25,558
3,F,35,338
4,F,45,189
5,F,50,146
6,F,56,102
7,M,1,144
8,M,18,805
9,M,25,1538


In [152]:
users2.pivot(index='Age',columns='Gender',values=0)

Gender,F,M
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,78,144
18,298,805
25,558,1538
35,338,855
45,189,361
50,146,350
56,102,278


In [137]:
users.Gender.value_counts()

M    4331
F    1709
Name: Gender, dtype: int64

In [139]:
pd.crosstab(users.Gender,users.Age)

Age,1,18,25,35,45,50,56
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,78,298,558,338,189,146,102
M,144,805,1538,855,361,350,278


### Most common genre

In [158]:
movies.Genres.values.flatten()

array(["Animation|Children's|Comedy", "Adventure|Children's|Fantasy",
       'Comedy|Romance', ..., 'Drama', 'Drama', 'Drama|Thriller'],
      dtype=object)

In [173]:
movies.Genres.str.split('|', expand=True).melt().dropna().value.value_counts().reset_index().head()

Unnamed: 0,index,value
0,Drama,1603
1,Comedy,1200
2,Action,503
3,Thriller,492
4,Romance,471


In [174]:
movies.Genres.str.split('|', expand=True).stack().value_counts().reset_index().head()

Unnamed: 0,index,0
0,Drama,1603
1,Comedy,1200
2,Action,503
3,Thriller,492
4,Romance,471


### Find best movie of all times

In [175]:
movies.head()

Unnamed: 0,MovieID,Title,Genres,Year,Year_int
0,1,Toy Story,Animation|Children's|Comedy,1995-01-01,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995-01-01,1995
2,3,Grumpier Old Men,Comedy|Romance,1995-01-01,1995
3,4,Waiting to Exhale,Comedy|Drama,1995-01-01,1995
4,5,Father of the Bride Part II,Comedy,1995-01-01,1995


In [176]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [207]:
movies_to_ratings = movies.merge(ratings, left_on='MovieID', right_on='MovieID')
movies_to_ratings.groupby('Title')[['Rating']].mean().sort_values(ascending=False, by='Rating').head(20)

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Smashing Time,5.0
Song of Freedom,5.0
One Little Indian,5.0
Ulysses (Ulisse),5.0
"Baby, The",5.0
Follow the Bitch,5.0
Schlafes Bruder (Brother of Sleep),5.0
"Gate of Heavenly Peace, The",5.0
Bittersweet Motel,5.0
Lured,5.0


In [208]:
movies_to_ratings.groupby('Title').agg({'Rating': ['mean'], 'UserID': 'count'}).sort_values([('Rating','mean'),('UserID','count')], ascending=False).head(20)

Unnamed: 0_level_0,Rating,UserID
Unnamed: 0_level_1,mean,count
Title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Gate of Heavenly Peace, The",5.0,3
Smashing Time,5.0,2
"Baby, The",5.0,1
Bittersweet Motel,5.0,1
Follow the Bitch,5.0,1
Lured,5.0,1
One Little Indian,5.0,1
Schlafes Bruder (Brother of Sleep),5.0,1
Song of Freedom,5.0,1
Ulysses (Ulisse),5.0,1


In [236]:
movies_count = pd.merge(movies, ratings).groupby('Title').agg({'Rating': ['mean'], 'UserID': 'count'}).sort_values([('Rating','mean'),('UserID','count')], ascending=False)
movies_count[movies_count[('UserID','count')]>100].head()

Unnamed: 0_level_0,Rating,UserID
Unnamed: 0_level_1,mean,count
Title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai),4.56051,628
"Shawshank Redemption, The",4.554558,2227
"Godfather, The",4.524966,2223
"Close Shave, A",4.520548,657
"Usual Suspects, The",4.517106,1783


In [255]:
movies_count = pd.merge(movies, ratings).groupby('Title').agg({'Rating': ['mean','size']})
movies_count[movies_count[('Rating', 'size')]>100].sort_values([('Rating','mean'),('Rating','size')], ascending=False).head()

Unnamed: 0_level_0,Rating,Rating
Unnamed: 0_level_1,mean,size
Title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai),4.56051,628
"Shawshank Redemption, The",4.554558,2227
"Godfather, The",4.524966,2223
"Close Shave, A",4.520548,657
"Usual Suspects, The",4.517106,1783


### Best movie by gender

In [292]:
movies_m_f = pd.merge(movies, ratings)
movies_m_f = movies_m_f.merge(users, left_on='UserID', right_on='UserID')
movies_m_f = movies_m_f.groupby(['Title', 'Gender']).agg({'Rating': ['mean','size']}).unstack()
# movies_m_f = movies_m_f.sort_values([('Rating','mean'),('Rating','size')], ascending=False)
movies_m_f = movies_m_f[movies_m_f.loc[:,('Rating','size')]['F'] > 100]
movies_m_f = movies_m_f[movies_m_f.loc[:,('Rating','size')]['M'] > 100]
movies_m_f.head()

Unnamed: 0_level_0,Rating,Rating,Rating,Rating
Unnamed: 0_level_1,mean,mean,size,size
Gender,F,M,F,M
Title,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
10 Things I Hate About You,3.646552,3.311966,232.0,468.0
101 Dalmatians,3.545994,3.287162,337.0,592.0
12 Angry Men,4.184397,4.328421,141.0,475.0
"13th Warrior, The",3.112,3.168,125.0,625.0
2001: A Space Odyssey,3.825581,4.129738,344.0,1372.0


In [295]:
ratings_users = pd.merge(ratings, users)

In [297]:
mean_ratings = ratings_users.groupby(["MovieID", "Gender"]).agg({"Rating" : ["mean", "size"]})
mean_ratings.columns = ["mean", "size"]
mean_ratings = mean_ratings.reset_index()
mean_ratings100 = mean_ratings[mean_ratings["size"] > 100]
pd.merge(mean_ratings100, movies).sort_values(by="mean", ascending=False)\
                                                    .drop_duplicates(subset="Gender")

Unnamed: 0,MovieID,Gender,mean,size,Title,Genres,Year,Year_int
474,745,F,4.644444,180,"Close Shave, A",Animation|Comedy|Thriller,1995-01-01,1995
521,858,M,4.583333,1740,"Godfather, The",Action|Crime|Drama,1972-01-01,1972


### Mean year by age group

In [299]:
movies_join = pd.merge(movies, ratings)
movies_join = movies_join.merge(users, left_on='UserID', right_on='UserID')
movies_join.groupby('Age')[['Year_int']].mean()

Unnamed: 0_level_0,Year_int
Age,Unnamed: 1_level_1
1,1988.981699
18,1989.701982
25,1987.972972
35,1984.965478
45,1983.50052
50,1982.483211
56,1981.549097


### 3 most watched genres by gender

In [305]:
movies_join[movies_join.Gender == 'F'].groupby('Title').size().sort_values(ascending=False).reset_index().head(3)

Unnamed: 0,Title,0
0,American Beauty,946
1,Shakespeare in Love,798
2,"Silence of the Lambs, The",706


In [306]:
movies_join[movies_join.Gender == 'M'].groupby('Title').size().sort_values(ascending=False).reset_index().head(3)

Unnamed: 0,Title,0
0,American Beauty,2482
1,Star Wars: Episode IV - A New Hope,2344
2,Star Wars: Episode V - The Empire Strikes Back,2342


In [316]:
male = movies_join[movies_join.Gender == 'M']
female = movies_join[movies_join.Gender == 'F']

In [317]:
male.Genres.str.split("|", expand=True).stack().reset_index(drop=True).value_counts()

Comedy         260309
Drama          256376
Action         211807
Thriller       149372
Sci-Fi         129894
Adventure      106621
Romance         97226
Crime           63099
Horror          61751
War             54434
Children's      50869
Animation       31072
Mystery         30202
Musical         28028
Fantasy         27583
Western         17206
Film-Noir       14059
Documentary      5970
dtype: int64

In [318]:
female.Genres.str.split("|", expand=True).stack().reset_index(drop=True).value_counts()

Drama          98153
Comedy         96271
Romance        50297
Action         45650
Thriller       40308
Sci-Fi         27400
Adventure      27332
Children's     21317
Crime          16442
Horror         14635
War            14093
Musical        13505
Animation      12221
Mystery         9976
Fantasy         8718
Film-Noir       4202
Western         3477
Documentary     1940
dtype: int64