In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as ss
import plotly.express as px

In [2]:
# Load the games sales data. The relative path is resolved against the current
# working directory, so 'games.csv' has to be reachable from there.
df_games = pd.read_csv('games.csv')
# Preview only the first rows instead of rendering the whole frame
df_games.head()

FileNotFoundError: [Errno 2] No such file or directory: 'games.csv'

In [None]:
# Info about DataFrame
def overview(df):
    """Print a plain-text overview report of a DataFrame to stdout.

    The report has six sections, each introduced by a 125-character banner:
    ``DataFrame.info()`` output, row/column counts, column dtypes,
    ``describe(include="all")`` statistics, per-column missing-value counts
    (only columns that have at least one missing value, largest first) and
    the number of fully duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    def banner(title):
        # Centre the section title inside a line of dashes
        print(title.center(125, '-'))

    banner(" DATA INFO ")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the report.
    df.info()

    banner(" SHAPE OF DATASET ")
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('Rows:{}'.format(n_rows))
    print('Columns:{}'.format(n_cols))

    banner(" DATA TYPES ")
    print(df.dtypes)

    banner(" STATISTICS OF DATA ")
    print(df.describe(include="all"))

    banner(" MISSING VALUES ")
    # Count missing values once and reuse the result for filtering
    missing_per_column = df.isnull().sum()
    print(missing_per_column[missing_per_column > 0].sort_values(ascending=False))

    banner(" DUPLICATED VALUES ")
    print(df.duplicated().sum())

# Print the overview report for the loaded games data
overview(df_games)

Data description:

    Rank - rank by sales volume
    Name - name of the game
    Platform - platform on which the game was released
    Year - year of release
    Genre - genre
    Publisher - publisher
    NA_Sales - sales in North America, in millions.
    EU_Sales - sales in Europe, in millions.
    JP_Sales - sales in Japan, in millions.
    Other_Sales - sales in the rest of the world, in millions.
    Global_Sales - worldwide sales, in millions.

Basic information:
- Shape: (16598, 11)
- Year is a float
- Missing values are present
- There are no duplicates

In [None]:
# Drop every row that contains at least one missing value and report the
# frame's shape before and after
shape_before = df_games.shape
print("Shape of data before removing NaN's", shape_before)

df_games.dropna(inplace=True)

shape_after = df_games.shape
print("Shape of data after removing NaN's", shape_after)

In [None]:
# Confirm that no missing values remain after the rows with NaN's were dropped.
# (The previous message mentioned a "customerID" column that this dataset does not have.)
print("Missing values in each column after removing NaN's:\n", df_games.isnull().sum())

In [None]:
# Cast the release year column to an integer dtype
df_games['Year'] = df_games['Year'].astype('int')

In [None]:
# Visualization of the number of games released per year.
# The frame is passed as `data` and the column is named via `x`, instead of
# handing the same Series to seaborn twice (positionally and as `x=`).
fig, ax = plt.subplots(figsize=(19, 9))
sns.countplot(data=df_games, x='Year', ax=ax)
ax.set(title='Number of games released per year',
       xlabel='Year',
       ylabel='Number of games');

In [None]:
# Summary statistics of the release year
df_games['Year'].describe()

In [None]:
# Most frequent release year (keepdims=True keeps the result array-shaped)
ss.mode(df_games['Year'], keepdims=True)

The median is greater than the mean. The mode is 2009, and about half of the games were released after 2007.

In [None]:
# Platforms with the most releases: share of all games, in percent,
# keeping only platforms whose share exceeds 7%
(
    df_games['Platform']
    .value_counts(normalize=True)
    .mul(100)
    .to_frame('pct')
    .query('pct > 7')
)

In [None]:
# The 10 publishers with the largest number of games in the dataset
(
    df_games['Publisher']
    .value_counts()
    .head(10)
    .to_frame('Number')
)

In [None]:
# Keep only the games published by Nintendo
is_nintendo = df_games['Publisher'] == "Nintendo"
df_only_nintendo = df_games[is_nintendo]

In [None]:
# Mean and median Nintendo sales per region (NA, EU, JP, Other)
regional_sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df_only_nintendo[regional_sales_columns].agg(['mean', 'median'])

In [None]:
# Distribution of Nintendo's sales in Japan, one box per genre
fig, ax = plt.subplots(figsize=(12, 12))
sns.boxplot(x='Genre', y='JP_Sales', data=df_only_nintendo, ax=ax)

In [None]:
# Descriptive statistics of Nintendo's Japan sales, one row per genre
jp_sales_by_genre = df_only_nintendo.groupby('Genre')['JP_Sales']
jp_sales_by_genre.describe()

In [None]:
# Yearly global sales of Nintendo games, one line per genre, restricted to
# Fighting, Simulation, Platform, Racing and Sports.
selected_genres = ['Fighting', 'Simulation', 'Platform', 'Racing', 'Sports']

genre_mask = df_only_nintendo['Genre'].isin(selected_genres)
nintendo_selected_genres = df_only_nintendo[genre_mask]

# Total global sales for every (genre, year) pair, with the keys kept as columns
grouped_data = (
    nintendo_selected_genres
    .groupby(['Genre', 'Year'], as_index=False)['Global_Sales']
    .sum()
)

px.line(grouped_data, x='Year', y='Global_Sales', color='Genre')