In [2]:
import pandas as pd
import re

In [3]:
ratings_movies = pd.read_csv('data/ratings_movies.csv')

In [4]:
ratings_movies.head(3)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,date,title,genres
0,0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller


In [5]:
r'\(\d{4}\)'

'\\(\\d{4}\\)'

In [6]:
re.findall(r'\(\d{4}\)', '(2000) (1998)')

['(2000)', '(1998)']

In [7]:
def get_year_release(arg):
    candidates = re.findall(r'\(\d{4}\)', arg)
    if len(candidates) > 0:
        year = candidates[0].replace('(', '')
        year = year.replace(')', '')
        return int(year)
    else: 
        return None
        

In [8]:
ratings_movies['year_release'] = ratings_movies['title'].apply(get_year_release)

In [9]:
ratings_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    100836 non-null  int64  
 1   userId        100836 non-null  int64  
 2   movieId       100836 non-null  int64  
 3   rating        100836 non-null  float64
 4   date          100836 non-null  object 
 5   title         100836 non-null  object 
 6   genres        100836 non-null  object 
 7   year_release  100818 non-null  float64
dtypes: float64(2), int64(3), object(3)
memory usage: 6.2+ MB


In [10]:
ratings_movies[ratings_movies['year_release'] == 1999].loc[:, ['title', 'rating']].groupby('title').mean().sort_values(by='rating')

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
Bloodsport: The Dark Kumite (1999),0.5
Simon Sez (1999),1.0
Chill Factor (1999),1.0
"Source, The (1999)",1.0
Trippin' (1999),1.0
...,...
Trailer Park Boys (1999),5.0
Larry David: Curb Your Enthusiasm (1999),5.0
Sun Alley (Sonnenallee) (1999),5.0
George Carlin: You Are All Diseased (1999),5.0


In [11]:
ratings_movies[ratings_movies['year_release'] == 2010].loc[:, ['genres', 'year_release', 'rating']].groupby('genres').mean().sort_values(by='rating')

Unnamed: 0_level_0,year_release,rating
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Action|Sci-Fi,2010.0,1.000000
Action|Adventure|Horror,2010.0,1.500000
Action|Drama|Fantasy,2010.0,1.500000
Crime|Romance,2010.0,1.500000
Adventure|Comedy|Fantasy,2010.0,1.833333
...,...,...
Crime,2010.0,4.750000
Comedy|Musical,2010.0,5.000000
Animation|Drama|Fantasy|Mystery,2010.0,5.000000
Adventure|Children|Comedy|Mystery,2010.0,5.000000


In [12]:
ratings_movies.loc[:, ['userId', 'genres']].groupby('userId').nunique().sort_values(by='genres', ascending=False)

Unnamed: 0_level_0,genres
userId,Unnamed: 1_level_1
599,524
414,482
448,403
380,399
474,395
...,...
578,15
12,15
85,13
214,13


In [13]:
ratings_movies[ratings_movies['userId'] == 599].loc[:, ['userId', 'genres']].nunique()

userId      1
genres    524
dtype: int64

In [14]:
ratings_movies.groupby('userId')['rating'].agg(
    ['count', 'mean']
).sort_values(by=['count', 'mean'], ascending=[True, False])

Unnamed: 0_level_0,count,mean
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
53,20,5.000000
595,20,4.200000
189,20,4.100000
569,20,4.000000
278,20,3.875000
...,...,...
274,1346,3.235884
448,1864,2.847371
474,2108,3.398956
599,2478,2.642050


In [15]:
g2018 = ratings_movies[ratings_movies['year_release'] == 2018].groupby('genres')['rating'].agg(['mean', 'count'])
g2018[g2018['count'] > 10].sort_values(by=['mean', 'count'], ascending=[False, True])

Unnamed: 0_level_0,mean,count
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Action|Adventure|Sci-Fi,3.928571,14
Action|Comedy|Sci-Fi,3.875,12


In [16]:
ratings_movies['date'].head()

0    2000-07-30 18:45:03
1    2000-07-30 18:20:47
2    2000-07-30 18:37:04
3    2000-07-30 19:03:35
4    2000-07-30 18:48:51
Name: date, dtype: object

In [17]:
ratings_movies['date'] = pd.to_datetime(ratings_movies['date'], dayfirst=False)
ratings_movies['year_rating'] = ratings_movies['date'].dt.year
ratings_movies['year_rating'].head()

0    2000
1    2000
2    2000
3    2000
4    2000
Name: year_rating, dtype: int64

In [18]:
pivot = ratings_movies.pivot_table(
    index='year_rating',
    values='rating',
    columns='genres',
    aggfunc='mean', 
    fill_value=0
    
)

pivot.head(3)

genres,(no genres listed),Action,Action|Adventure,Action|Adventure|Animation,Action|Adventure|Animation|Children,Action|Adventure|Animation|Children|Comedy,Action|Adventure|Animation|Children|Comedy|Fantasy,Action|Adventure|Animation|Children|Comedy|IMAX,Action|Adventure|Animation|Children|Comedy|Romance,Action|Adventure|Animation|Children|Comedy|Sci-Fi,...,Romance|Thriller,Romance|War,Romance|Western,Sci-Fi,Sci-Fi|IMAX,Sci-Fi|Thriller,Sci-Fi|Thriller|IMAX,Thriller,War,Western
year_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996,0.0,2.730769,3.454545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.666667,0.0,3.838095,0.0,3.117647
1997,0.0,3.538462,4.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.4,0.0,3.923077,0.0,3.0
1998,0.0,0.0,4.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.8,0.0,0.0


In [19]:
pivot.sort_values(by='year_rating', ascending=False)

genres,(no genres listed),Action,Action|Adventure,Action|Adventure|Animation,Action|Adventure|Animation|Children,Action|Adventure|Animation|Children|Comedy,Action|Adventure|Animation|Children|Comedy|Fantasy,Action|Adventure|Animation|Children|Comedy|IMAX,Action|Adventure|Animation|Children|Comedy|Romance,Action|Adventure|Animation|Children|Comedy|Sci-Fi,...,Romance|Thriller,Romance|War,Romance|Western,Sci-Fi,Sci-Fi|IMAX,Sci-Fi|Thriller,Sci-Fi|Thriller|IMAX,Thriller,War,Western
year_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018,3.676471,2.588235,3.397436,3.65,3.166667,3.741935,4.142857,3.7,2.5,4.0,...,0.0,0.0,0.0,3.128571,3.5,3.5625,0.0,3.296296,0.0,3.611111
2017,3.0,2.637931,3.527778,3.375,3.5,3.676471,3.7,1.0,1.0,2.0,...,3.0,0.0,0.0,3.864865,4.264706,3.75,2.5,3.152174,0.0,2.933333
2016,3.5,3.545455,3.722222,3.75,3.833333,3.925,3.388889,3.666667,1.0,4.0,...,2.5,0.0,0.0,3.555556,3.807692,3.916667,3.666667,3.432432,0.0,3.958333
2015,0.5,3.1,3.560976,3.8,3.666667,3.565217,3.833333,3.375,2.333333,2.5,...,0.0,0.0,0.0,2.9,4.071429,3.764706,4.0,2.911765,0.0,3.9
2014,0.0,0.0,3.875,3.0,3.5,3.25,3.833333,3.5,4.0,0.0,...,0.0,0.0,0.0,2.5,5.0,0.0,4.0,4.0,0.0,4.0
2013,0.0,2.5,3.833333,0.0,3.0,4.333333,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,1.5,0.0,3.833333,4.0,3.666667,0.0,4.0
2012,0.0,2.625,3.9,0.0,4.0,3.785714,0.0,3.55,4.0,0.0,...,2.5,0.0,0.0,4.166667,0.0,3.5,3.666667,3.083333,0.0,4.1
2011,0.0,4.5,3.888889,0.0,0.0,3.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,3.5,0.0,1.833333,4.0,3.625,0.0,4.0
2010,0.0,3.5,3.4375,0.0,0.0,4.333333,0.0,3.5,3.0,0.0,...,0.0,0.0,0.0,4.0,0.0,2.5,0.0,2.833333,0.0,3.666667
2009,0.0,0.0,3.714286,0.0,0.0,3.9,0.0,3.25,0.0,2.5,...,0.0,0.0,0.0,2.0,0.0,4.0,0.0,2.964286,4.0,3.375


In [20]:
#1
#За весь период (с 1996 по 2018 год) сочетание жанров Action|Adventure ни разу не получало среднюю оценку ниже 3
pivot.loc[1996:2018, 'Action|Adventure'].sort_values().head(3)


year_rating
2003    3.277778
2018    3.397436
2005    3.413043
Name: Action|Adventure, dtype: float64

In [21]:
pivot['Action|Adventure|Animation|Children|Comedy|IMAX'].sort_values(ascending=False).head()

year_rating
2013    5.000000
2018    3.700000
2016    3.666667
2012    3.550000
2014    3.500000
Name: Action|Adventure|Animation|Children|Comedy|IMAX, dtype: float64

In [22]:
pivot.loc[2018, :].sort_values(ascending=False).head(10)

genres
Comedy|Horror|Mystery                        5.0
Comedy|Fantasy|Horror|Musical|Thriller       5.0
Animation|Children|Mystery                   5.0
Comedy|Crime|Horror|Thriller                 5.0
Drama|Horror|Mystery|Sci-Fi|Thriller         5.0
Adventure|Children|Comedy|Fantasy|Mystery    5.0
Action|Crime|Thriller|Western                5.0
Comedy|Crime|Fantasy                         5.0
Crime|Mystery                                5.0
Drama|Romance|War                            5.0
Name: 2018, dtype: float64

In [23]:
(pivot['Comedy'].diff() < 0).value_counts()

False    12
True     11
Name: Comedy, dtype: int64

In [24]:
orders = pd.read_csv('data/orders_and_products/orders.csv', sep=';')
orders.head()

Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество
0,09.11.2019 21:55:51,9,10,"Принят, ожидается оплата",Нет,Нет,Нет,103,5
1,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,86,100
2,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,104,10
3,09.11.2019 12:50:07,7,8,"Принят, ожидается оплата",Нет,Нет,Нет,104,7
4,09.11.2019 12:00:00,6,1,"Принят, ожидается оплата",Нет,Нет,Нет,104,5


In [25]:
products = pd.read_csv('data/orders_and_products/products.csv', sep=';')
products.head()

Unnamed: 0,Product_ID,Name,Price,CURRENCY
0,47,Шатны Полосатый рейс,2999,RUR
1,51,Платье Аленький цветочек,4999,RUR
2,53,Штаны Цветочная Поляна,4999,RUR
3,71,Платье Ночная Жизнь,7999,RUR
4,74,Платье Ночная Жизнь XXXL,8999,RUR


In [26]:
orders_products = orders.merge(products, how='left', left_on='ID товара', right_on='Product_ID')
orders_products.head(3)

Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество,Product_ID,Name,Price,CURRENCY
0,09.11.2019 21:55:51,9,10,"Принят, ожидается оплата",Нет,Нет,Нет,103,5,103.0,"Носки Подарочные, муж",199.0,RUR
1,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,86,100,86.0,"Носки Простые, муж",45.0,RUR
2,09.11.2019 15:05:57,8,9,"Принят, ожидается оплата",Нет,Нет,Нет,104,10,104.0,"Носки Подарочные, жен",249.0,RUR


In [27]:
orders_products[orders_products['Name'].isnull() == True]

Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество,Product_ID,Name,Price,CURRENCY
17,01.01.2001 00:00:00,0,1,"Оплачен, формируется к отправке",Да,Нет,Нет,666,1,,,,


In [28]:
orders_products[orders_products['Отменен'] != 'Нет']

Unnamed: 0,Дата создания,Order ID,ID Покупателя,Статус,Оплачен,Отменен,Отгружен,ID товара,Количество,Product_ID,Name,Price,CURRENCY
6,08.11.2019 08:36:22,5,5,Отменён,Нет,Да,Нет,124,1,124.0,Носки беговые Camino,999.0,RUR


In [29]:
sold = orders_products[orders_products['Отменен'] == 'Нет']
sold['Profit'] = sold['Price'] * sold['Количество']
sold.groupby('ID Покупателя')['Profit'].sum().sort_values()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sold['Profit'] = sold['Price'] * sold['Количество']


ID Покупателя
10      995.0
1      2240.0
8      3087.0
9      7040.0
5     13043.0
7     17096.0
Name: Profit, dtype: float64

In [30]:
pd.to_datetime(ratings_movies['date'], dayfirst=True).dt.day

0         30
1         30
2         30
3         30
4         30
          ..
100831     3
100832     3
100833     8
100834     3
100835     3
Name: date, Length: 100836, dtype: int64

In [31]:
a = ratings_movies.groupby(by='genres', as_index=True)['rating'].agg('describe')
a

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(no genres listed),47.0,3.489362,1.218023,0.5,2.750,3.5,4.5,5.0
Action,186.0,2.935484,1.073579,0.5,2.000,3.0,3.5,5.0
Action|Adventure,555.0,3.706306,1.067343,0.5,3.000,4.0,4.5,5.0
Action|Adventure|Animation,42.0,3.583333,0.854805,2.0,3.000,4.0,4.0,5.0
Action|Adventure|Animation|Children,39.0,3.410256,0.958836,0.5,3.000,3.5,4.0,5.0
...,...,...,...,...,...,...,...,...
Sci-Fi|Thriller,116.0,3.280172,1.011432,0.5,2.875,3.5,4.0,5.0
Sci-Fi|Thriller|IMAX,12.0,3.708333,0.541812,2.5,3.500,4.0,4.0,4.5
Thriller,628.0,3.426752,1.036519,0.5,3.000,3.5,4.0,5.0
War,9.0,3.555556,0.726483,3.0,3.000,3.0,4.0,5.0


In [32]:
a.index

Index(['(no genres listed)', 'Action', 'Action|Adventure',
       'Action|Adventure|Animation', 'Action|Adventure|Animation|Children',
       'Action|Adventure|Animation|Children|Comedy',
       'Action|Adventure|Animation|Children|Comedy|Fantasy',
       'Action|Adventure|Animation|Children|Comedy|IMAX',
       'Action|Adventure|Animation|Children|Comedy|Romance',
       'Action|Adventure|Animation|Children|Comedy|Sci-Fi',
       ...
       'Romance|Thriller', 'Romance|War', 'Romance|Western', 'Sci-Fi',
       'Sci-Fi|IMAX', 'Sci-Fi|Thriller', 'Sci-Fi|Thriller|IMAX', 'Thriller',
       'War', 'Western'],
      dtype='object', name='genres', length=951)

In [33]:
ratings_movies.groupby(by='genres', as_index=True)['rating'].agg(['mean', set])

Unnamed: 0_level_0,mean,set
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),3.489362,"{0.5, 1.0, 2.5, 3.5, 3.0, 4.0, 5.0, 4.5, 2.0}"
Action,2.935484,"{0.5, 1.0, 2.0, 3.5, 3.0, 4.5, 4.0, 2.5, 5.0, ..."
Action|Adventure,3.706306,"{0.5, 1.0, 2.0, 3.5, 4.5, 3.0, 4.0, 5.0, 2.5, ..."
Action|Adventure|Animation,3.583333,"{2.5, 2.0, 3.5, 4.5, 4.0, 5.0, 3.0}"
Action|Adventure|Animation|Children,3.410256,"{0.5, 1.5, 1.0, 3.0, 3.5, 4.0, 5.0, 4.5, 2.5, ..."
...,...,...
Sci-Fi|Thriller,3.280172,"{0.5, 1.5, 2.0, 3.5, 3.0, 4.5, 5.0, 4.0, 1.0, ..."
Sci-Fi|Thriller|IMAX,3.708333,"{2.5, 3.0, 3.5, 4.0, 4.5}"
Thriller,3.426752,"{0.5, 1.0, 2.0, 3.5, 3.0, 4.0, 5.0, 4.5, 2.5, ..."
War,3.555556,"{3.0, 4.0, 5.0}"


In [34]:
set(ratings_movies['rating'])

{0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0}

In [35]:
ratings_movies

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,date,title,genres,year_release,year_rating
0,0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,2000
1,1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance,1995.0,2000
2,2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller,1995.0,2000
3,3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,2000
4,4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1995.0,2000
...,...,...,...,...,...,...,...,...,...
100831,100831,610,166534,4.0,2017-05-03 21:53:22,Split (2017),Drama|Horror|Thriller,2017.0,2017
100832,100832,610,168248,5.0,2017-05-03 22:21:31,John Wick: Chapter Two (2017),Action|Crime|Thriller,2017.0,2017
100833,100833,610,168250,5.0,2017-05-08 19:50:47,Get Out (2017),Horror,2017.0,2017
100834,100834,610,168252,5.0,2017-05-03 21:19:12,Logan (2017),Action|Sci-Fi,2017.0,2017


In [36]:
ratings_movies.pivot_table(
    values='rating',
    index=['genres', 'year_release'],
    aggfunc=['median', 'mean']
)

Unnamed: 0_level_0,Unnamed: 1_level_0,median,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,rating,rating
genres,year_release,Unnamed: 2_level_2,Unnamed: 3_level_2
(no genres listed),1957.0,3.00,3.000000
(no genres listed),1968.0,3.00,3.000000
(no genres listed),1977.0,4.00,4.000000
(no genres listed),1980.0,4.50,4.500000
(no genres listed),1990.0,2.50,2.500000
...,...,...,...
Western,2003.0,4.00,3.944444
Western,2008.0,3.00,3.000000
Western,2010.0,4.00,3.750000
Western,2014.0,2.50,2.500000


In [43]:
pd.Series(
    [1, 2, 3],
    index=[['q', 'q', 'qqq'],
     ['a','b', 'd']]
)

q    a    1
     b    2
qqq  d    3
dtype: int64

In [44]:
import numpy
numpy.random.rand(8)

array([0.54112389, 0.90969888, 0.49112837, 0.96465506, 0.85902142,
       0.73784478, 0.42080117, 0.98814575])

In [45]:
ratings_movies['genres'][3]

'Mystery|Thriller'