In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [4]:
# Introduction to statistics in Python
# Load the App Store games dataset; the path is relative to the notebook's
# working directory.
DATA_PATH = "./appstore_games.csv"
games = pd.read_csv(DATA_PATH)

In [5]:
# (rows, columns) of the freshly loaded dataset.
games.shape

(17007, 18)

In [6]:
# Preview the first five rows to get a feel for the columns and their values.
games.head()

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,Average User Rating,User Rating Count,Price,In-app Purchases,Description,Developer,Age Rating,Languages,Size,Primary Genre,Genres,Original Release Date,Current Version Release Date
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,,"Join over 21,000,000 of our fans and download ...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",15853568.0,Games,"Games, Strategy, Puzzle",11/07/2008,30/05/2017
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,,"The classic game of Reversi, also known as Oth...",Kiss The Machine,4+,EN,12328960.0,Games,"Games, Strategy, Board",11/07/2008,17/05/2018
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.0,,Play the classic strategy game Othello (also k...,Bayou Games,4+,EN,674816.0,Games,"Games, Board, Strategy",11/07/2008,5/09/2017
3,https://apps.apple.com/us/app/sudoku-free/id28...,285755462,Sudoku (Free),,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,3.5,190394.0,0.0,,"Top 100 free app for over a year.\nRated ""Best...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",21552128.0,Games,"Games, Strategy, Puzzle",23/07/2008,30/05/2017
4,https://apps.apple.com/us/app/senet-deluxe/id2...,285831220,Senet Deluxe,,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,3.5,28.0,2.99,,"""Senet Deluxe - The Ancient Game of Life and A...",RoGame Software,4+,"DA, NL, EN, FR, DE, EL, IT, JA, KO, NO, PT, RU...",34689024.0,Games,"Games, Strategy, Board, Education",18/07/2008,22/07/2018


In [7]:
# Inspect the dtype pandas inferred for each column.
games.dtypes

URL                              object
ID                                int64
Name                             object
Subtitle                         object
Icon URL                         object
Average User Rating             float64
User Rating Count               float64
Price                           float64
In-app Purchases                 object
Description                      object
Developer                        object
Age Rating                       object
Languages                        object
Size                            float64
Primary Genre                    object
Genres                           object
Original Release Date            object
Current Version Release Date     object
dtype: object

In [9]:
# Standardise the column headers: lower-case each name and replace spaces
# with underscores (other characters, such as the hyphen in
# 'In-app Purchases', are left as they are).
columns_dict = {col: col.lower().replace(" ", "_") for col in games.columns}
# Display the old-name -> new-name mapping for a visual check.
columns_dict

{'URL': 'url',
 'ID': 'id',
 'Name': 'name',
 'Subtitle': 'subtitle',
 'Icon URL': 'icon_url',
 'Average User Rating': 'average_user_rating',
 'User Rating Count': 'user_rating_count',
 'Price': 'price',
 'In-app Purchases': 'in-app_purchases',
 'Description': 'description',
 'Developer': 'developer',
 'Age Rating': 'age_rating',
 'Languages': 'languages',
 'Size': 'size',
 'Primary Genre': 'primary_genre',
 'Genres': 'genres',
 'Original Release Date': 'original_release_date',
 'Current Version Release Date': 'current_version_release_date'}

In [11]:
# Apply the standardised column names built in `columns_dict`.
games = games.rename(columns=columns_dict)

In [12]:
# Use the `id` column as the row index.
games = games.set_index("id")

In [13]:
# Remove the two URL columns from the frame.
games = games.drop(columns=["url", "icon_url"])

In [14]:
# The CSV stores dates day-first (e.g. "30/05/2017", "23/07/2008"), so say so
# explicitly. Without `dayfirst=True`, ambiguous values such as "11/07/2008"
# are interpreted month-first (7 November instead of 11 July).
games["original_release_date"] = pd.to_datetime(games["original_release_date"], dayfirst=True)
games["current_version_release_date"] = pd.to_datetime(games["current_version_release_date"], dayfirst=True)

In [15]:
# Summarise the frame: index, column dtypes and non-null counts.
games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17007 entries, 284921427 to 1475076711
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   name                          17007 non-null  object        
 1   subtitle                      5261 non-null   object        
 2   average_user_rating           7561 non-null   float64       
 3   user_rating_count             7561 non-null   float64       
 4   price                         16983 non-null  float64       
 5   in-app_purchases              7683 non-null   object        
 6   description                   17007 non-null  object        
 7   developer                     17007 non-null  object        
 8   age_rating                    17007 non-null  object        
 9   languages                     16947 non-null  object        
 10  size                          17006 non-null  float64       
 11  primary_genre  

In [19]:
# Start to check on missing values: count the nulls in every column.
games.isnull().sum()

name                                0
subtitle                        11746
average_user_rating              9446
user_rating_count                9446
price                              24
in-app_purchases                 9324
description                         0
developer                           0
age_rating                          0
languages                          60
size                                1
primary_genre                       0
genres                              0
original_release_date               0
current_version_release_date        0
dtype: int64

In [23]:
# Compare the missing values in average_user_rating and user_rating_count:
# True means both columns are null on exactly the same rows.
rating_is_missing = games["average_user_rating"].isnull()
count_is_missing = games["user_rating_count"].isnull()
np.array_equal(rating_is_missing, count_is_missing)

True

In [25]:
# Exclude rows with a null average_user_rating, i.e. keep only rated games.
has_rating = games["average_user_rating"].notnull()
games = games.loc[has_rating]

In [26]:
# Re-count the nulls per column now that unrated games have been removed.
games.isnull().sum()

name                               0
subtitle                        4837
average_user_rating                0
user_rating_count                  0
price                              0
in-app_purchases                2927
description                        0
developer                          0
age_rating                         0
languages                         24
size                               0
primary_genre                      0
genres                             0
original_release_date              0
current_version_release_date       0
dtype: int64

In [27]:
# Exclude rows with a user_rating_count of less than 30
has_enough_ratings = games["user_rating_count"] >= 30
games = games.loc[has_enough_ratings]

In [28]:
# Re-count the nulls per column after filtering on the rating count.
games.isnull().sum()

name                               0
subtitle                        2523
average_user_rating                0
user_rating_count                  0
price                              0
in-app_purchases                1313
description                        0
developer                          0
age_rating                         0
languages                         14
size                               0
primary_genre                      0
genres                             0
original_release_date              0
current_version_release_date       0
dtype: int64

In [29]:
# (rows, columns) remaining after the filtering steps.
games.shape

(4311, 15)

In [30]:
# Exercise 7.01: Using a string column to produce a numerical column
# 1.
# Work on a copy so the changes made in this exercise do not touch `games`.
games2 = games.copy()

In [31]:
# Peek at the raw `languages` values: comma-separated language codes.
games2.languages.head()

id
284921427    DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...
284926400                                                   EN
284946595                                                   EN
285755462    DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...
286210009                                                   EN
Name: languages, dtype: object

In [33]:
# Replace missing language lists with the single code "EN".
games2["languages"] = games2["languages"].fillna("EN")

In [38]:
# Turn each comma-separated string into a list of language codes.
# Splitting on "," keeps the space that follows each comma, so every code
# after the first carries a leading blank; the element count is unaffected.
list_of_languages = games2["languages"].str.split(",")
list_of_languages

id
284921427     [DA,  NL,  EN,  FI,  FR,  DE,  IT,  JA,  KO,  ...
284926400                                                  [EN]
284946595                                                  [EN]
285755462     [DA,  NL,  EN,  FI,  FR,  DE,  IT,  JA,  KO,  ...
286210009                                                  [EN]
                                    ...                        
1471336822                                                 [EN]
1471595571                                                 [EN]
1473181500                                                 [EN]
1473559573                                                 [EN]
1474461379                                                 [EN]
Name: languages, Length: 4311, dtype: object

In [39]:
# Number of languages per game = length of each split list.
games2["n_languages"] = list_of_languages.apply(len)

In [40]:
# Inspect the derived per-game language counts.
games2.n_languages

id
284921427     17
284926400      1
284946595      1
285755462     17
286210009      1
              ..
1471336822     1
1471595571     1
1473181500     1
1473559573     1
1474461379     1
Name: n_languages, Length: 4311, dtype: int64

In [43]:
# Descriptive Statistics
# Draw 300 ratings at random. The fixed `random_state` makes the sample (and
# therefore the printed values) identical on every Restart & Run All.
random_ratings = games.average_user_rating.sample(n = 300, random_state = 42)
# Print the sampled ratings on one line, separated by ", ".
for r in random_ratings:
    print(r, end = ", ")

4.5, 4.5, 3.0, 2.5, 4.5, 4.5, 4.5, 4.5, 4.0, 4.0, 4.0, 4.5, 4.0, 3.5, 4.5, 2.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 3.5, 4.5, 4.0, 4.5, 4.0, 4.0, 4.5, 4.0, 3.5, 4.5, 4.5, 4.5, 2.5, 4.5, 3.5, 4.5, 4.5, 4.0, 4.5, 4.0, 4.5, 4.5, 4.5, 4.5, 4.0, 4.5, 3.5, 4.5, 4.5, 5.0, 4.0, 4.5, 4.0, 4.5, 4.5, 4.5, 3.5, 4.0, 4.0, 4.5, 4.5, 4.5, 4.5, 3.5, 5.0, 4.5, 4.5, 4.0, 4.5, 4.5, 5.0, 4.5, 3.0, 4.5, 4.5, 4.0, 2.0, 4.0, 3.5, 4.5, 3.5, 4.5, 3.5, 4.5, 4.0, 4.5, 5.0, 3.5, 3.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.0, 3.0, 4.5, 5.0, 4.5, 4.0, 4.0, 3.5, 4.0, 4.5, 4.0, 4.5, 4.0, 4.0, 5.0, 5.0, 4.5, 4.5, 4.0, 4.5, 4.0, 4.0, 4.5, 4.0, 5.0, 4.5, 4.5, 3.5, 4.5, 4.0, 5.0, 4.0, 4.0, 4.0, 4.5, 4.5, 2.5, 4.0, 4.5, 4.5, 4.5, 4.0, 4.5, 3.0, 4.5, 4.5, 4.5, 4.5, 5.0, 4.5, 4.5, 4.5, 4.0, 4.0, 4.0, 4.5, 5.0, 4.5, 4.5, 4.5, 3.5, 4.5, 5.0, 4.0, 4.5, 4.0, 4.0, 5.0, 4.5, 4.0, 4.0, 4.5, 3.5, 4.5, 4.5, 4.0, 5.0, 3.5, 4.5, 4.0, 4.0, 5.0, 4.0, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.0, 4.0, 4.5, 3.5, 4.5, 3.0, 4.0, 3.5, 4.5, 4.5, 4.5, 4.5, 

In [None]:
# Using descriptive statistics
