# Initialization & Data Preprocessing

In [35]:
import pandas as pd
import requests
from io import BytesIO
import plotly_express as px
from scipy import stats as st
from IPython.display import display

In [2]:
# importing necessary data from gsheet, attempting local path if there's an error

try:
    gsheet_id = '1KQGMyeuqEKlA3STh2Fk4OSEkuoPBEmG_dKnhDQGt5yI' 
    gsheet_url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gsheet_id) 
    r = requests.get(gsheet_url) 
    df = pd.read_csv(BytesIO(r.content))    
except:
    df = pd.read_csv('datasets/moved_games.csv')
    
df.info()
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
count,16713,16715,16446.0,16713,16715.0,16715.0,16715.0,16715.0,8137.0,10014,9949
unique,11559,31,,12,,,,,,96,8
top,Need for Speed: Most Wanted,PS2,,Action,,,,,,tbd,E
freq,12,2161,,3369,,,,,,2424,3990
mean,,,2006.484616,,0.263377,0.14506,0.077617,0.047342,68.967679,,
std,,,5.87705,,0.813604,0.503339,0.308853,0.186731,13.938165,,
min,,,1980.0,,0.0,0.0,0.0,0.0,13.0,,
25%,,,2003.0,,0.0,0.0,0.0,0.0,60.0,,
50%,,,2007.0,,0.08,0.02,0.0,0.01,71.0,,
75%,,,2010.0,,0.24,0.11,0.04,0.03,79.0,,


In [3]:
class DataSweeper():
    
    
    def __init__(self, df):
        """
        Constructor for the DataCleaner class.

        Params:
            df: A Pandas DataFrame that needs to be cleaned.
        """
        # Ensuring the input is a DataFrame
        if not isinstance(df, pd.DataFrame):
            raise ValueError('Argument must be a Pandas DataFrame')
        
        # Storing the DataFrame in an instance variable otherwise
        self.df = df.copy() # creating copy to avoid modifying the original DF
        
    def clean_columns(self):
        """
        This method cleans the column names of the dataset

        Returns:
            DataFrame: A Pandas DataFrame with columns names that are
            stripped of whitespace, lowercase, and snake_case
        """
        # Accessing DF with "self.df" & formatting the column names
        clean_columns = self.df.columns \
            .str.strip() \
            .str.lower() \
            .str.replace(' ', '_') 
        
        # Checking that column names are formatted properly                         
        if list(self.df.columns) != list(clean_columns):
            self.df.columns = clean_columns
        else:
            pass
        
        return self.df

In [4]:
df.sample()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
10282,Robotech: Invasion,PS2,2004.0,Shooter,0.05,0.04,0.0,0.01,57.0,tbd,T


### Fixing Data

In [5]:
# Convering columns to lowercase & stripping them of any white space with my own module
df = DataSweeper(df).clean_columns()
df.sample()

# Changing the data type of `user_score` column to `float`
df['user_score'] = pd.to_numeric(df['user_score'], errors='coerce') 

# Confirming changes were made successfully
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16713 non-null  object 
 1   platform         16715 non-null  object 
 2   year_of_release  16446 non-null  float64
 3   genre            16713 non-null  object 
 4   na_sales         16715 non-null  float64
 5   eu_sales         16715 non-null  float64
 6   jp_sales         16715 non-null  float64
 7   other_sales      16715 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       7590 non-null   float64
 10  rating           9949 non-null   object 
dtypes: float64(7), object(4)
memory usage: 1.4+ MB


<div class='alert alert-info'>

##### The dataset that we will be working with today was provided by the online store, ICE, which sells video games all over the world. It contains historical data of games sold on their platform up to 2016. Our goal is to seek out and identify patterns in past game realease successes (& failures) in order to (1) more accurately spot winners and avoid losers going forward and (2), optimize advertising campaign spend.

Let's take a look at each feature available to us one by one and decided if any cleaning needs to be done before beginning our analysis

1. `name`: name of video game

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: two missing values**</span>
    - I will remove these rows since there will be minimal effect to the outcomes of our analysis

2. `platform`: the console(s) the game was sold on

- Data Type: Appropriate
- Missing Values: None
- Observations: 31 different consoles with the "PS2" being the most common

3. `year_of_release`: the year the game first hit the market

- Data Type: Sufficient
- <span style="color:red;">**Missing Values: Roughly 300-400 missing values**</span>
    - I will remove these rows as there will be minimal effect to the outcomes of our analysis
- Observations: Ranges from 1980 - 2016

4. `genre`: the video game genre

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: 2 missing values**</span>
    - These are likely missing due to there being lack of clarity on the genre of the game
    - I will remove these rows entirely since there will be minimal effect to the outcomes of our analysis
- Observations: 12 different genres are found in ICE's data with "Action" being the most common

5. `na_sales`: north american sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 41,000,000 

6. `eu_sales`: european sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 29,000,000

7. `jp_sales`: japanese sales of the game

- Data Type: Appropriate
- Missing Values: None
- Observations: Highest amount of units sold was around 10,000,000

8. `other_sales`: sales of the game that took place outside of the three aforementioned markets

- Data Type: Appropriate
- Missing Values: None

9. `critic_score`: score given to the game by critics

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: Roughly half of the values are missing**</span>
    - I will replace these missing values with `0` so they don't affect our analysis
- Observations: Critic score's ranged from 13-98 and the average score is found in the high 60s

10. `user_score`: score given to the game by consumers

- <span style="color:red;">**Data Type: Innappropriate**</span>
    - I will change the data type of this feature to float
- <span style="color:red;">**Missing Values: Roughly 6,700 values are missing**</span>
    - I will replace these missing values with `0` so they don't affect our analysis
- Observations: The most common value is "TBD" so I will replace these by setting `errors='coerce'` when changing the data type of this column with `pd.to_numeric()` since it is unlikely that we will get these scores from the users at this point

11. `rating`: The official ESRB rating which states what age range the game is intended for

- Data Type: Appropriate
- <span style="color:red;">**Missing Values: Roughly 7,000 missing values**</span>
    - I will replace these missing values with `0` so they don't affect our analysis 

</div>

In [6]:
# dropping the rows with missing values in either the name, year of release, or genre columns
df.dropna(subset=['name', 'year_of_release', 'genre'], inplace=True)

# testing that rows were dropped as expecting
print(df['name'].isna().sum())
print(df['year_of_release'].isna().sum())
print(df['genre'].isna().sum())

0
0
0


In [7]:
# calculating % of remaining missing values
total_missing = df.isna().any(axis=1).sum()
percent_missing = round((total_missing / len(df)) * 100)
print(f"Percentage of rows with missing values: {percent_missing}%")

Percentage of rows with missing values: 58%


In [8]:
# filling in missing values in either the rating, user score, or critic score columns with 0
df.fillna(-1, inplace=True)

# testing that there are no longer any missing values
df.isna().any(axis=1).sum()

0

### Enriching Data

In [9]:
# Adding a `total_sales` column 
df['total_sales'] = df['na_sales'] + df['eu_sales'] + df['jp_sales'] + df['other_sales']

# Confirming column was created successfully
df['total_sales'].head()

0    82.54
1    40.24
2    35.52
3    32.77
4    31.38
Name: total_sales, dtype: float64

# Exploratory Data Analysis

<div class='alert alert-info'>

With the data clean, we can now being our analysis. I will look to answer the following questions about the data:

1. Look at how many games were released in different years. Is the data for every period significant?
2. Look at how sales varied from platform to platform. Choose the platforms with the greatest total sales and build a distribution based on data for each year. Find platforms that used to be popular but now have zero sales. How long does it generally take for new platforms to appear and old ones to fade?
3. Determine what period you should take data for. To do so, look at your answers to the previous questions. The data should allow you to build a prognosis for 2017.
4. Work only with the data that you've decided is relevant. Disregard the data for previous years.
5. Which platforms are leading in sales? Which ones are growing or shrinking? Select several potentially profitable platforms.
6. Build a box plot for the global sales of all games, broken down by platform. Are the differences in sales significant? What about average sales on various platforms? Describe your findings.
7. Take a look at how user and professional reviews affect sales for one popular platform (you choose). Build a scatter plot and calculate the correlation between reviews and sales. Draw conclusions.
8. Keeping your conclusions in mind, compare the sales of the same games on other platforms.
9. Take a look at the general distribution of games by genre. What can we say about the most profitable genres? Can you generalize about genres with high and low sales?

</div>

In [36]:
# plotting distribution of game releases per year

releases_per_year = px.histogram(
                            df, 
                            x='year_of_release',
                            title='Number of Games Released per Annum',
                            labels={
                                'year_of_release': 'Year',
                                'count': 'Games Released'})
display(releases_per_year.show())

None

<div class='alert alert-info'> 

ICE did not start to seriously scale their release schedule until 1995. They ultimately hit their peak in the years 2008 and 2009 when they released 1,427 and 1,426 games respectively. Three years after hitting their peak, the total number of games they released dropped more than 50%. At this point in our analysis it's hard to definitively say what could've caused this drop in production, but considering they hit their peak during the Great Financial Crisis of 2008, it's entirely possible that they were affected severely and needed to cut costs. My rationale being that the games released in 2008 and 2009 were being developed at the latest in 2007 - *but more than likely for multiple years prior to their release date* - a time where they were producing more games than ever, hence spending more capital than ever to (1) develop all of these games and (2) advertise their games to consumers which would explain a reduction in game production of this magnitude. As of 2016 they have yet to reach even 50% of their previous highs.

</div>

In [37]:
# plotting the distribution of sales per gaming platform 

by_platform = df.groupby(['platform', 'year_of_release'])['total_sales'].sum().reset_index().sort_values(by='total_sales', ascending=False)

by_platform_hist = px.bar(
                    by_platform, 
                    x='platform', 
                    y='total_sales',
                    title='All-Time Sales per Platform', 
                    labels={
                            'platform': 'Platform',
                            'total_sales': 'Sales (USD Millions)'
                        }
                    )

display(by_platform_hist.show())

None

<div class='alert alert-info'>

The best performing gaming platforms are the Nintendo DS, Playstation's 1 through 3, Wii, and XBOX 360. Let's see how each of these platforms performed over the years 

</div>

In [38]:
# Filtering the original dataframe for data related to only the top platforms
top_platforms = df[df['platform'].isin(['DS', 'PS', 'PS2', 'PS3', 'Wii', 'X360'])]

# Plotting a grouped histogram to see the historic distribution of sales on the top platform

top_platforms_plot = px.histogram(
    top_platforms, 
    x='year_of_release', 
    y='total_sales', 
    color='platform', 
    barmode='group', 
    title='Distribution of Annual Game Sales on ICE\'s Best Selling Platforms',
    labels={
        'year_of_release': 'Year',
        'total_sales': 'Total Sales',
        'platform': 'Platform'
    }
    )

display(top_platforms_plot.show())

None

In [13]:
# creating a dataframe with the years the first and last game was released on each of the top platforms
# in order to create a gaant chart that visualizes the life cycles of each of the top platforms

start_years = top_platforms.groupby('platform')['year_of_release'].min()

end_years = top_platforms.groupby('platform')['year_of_release'].max()

total_sales = top_platforms.groupby('platform')['total_sales'].sum()

platform_lifecycle = pd.DataFrame(
    {
    'start_year': start_years,
    'end_year': end_years,
    'sales':total_sales
    }
    ).reset_index()

# converting the year columns to datetime

platform_lifecycle['start_year'] = pd.to_datetime(platform_lifecycle['start_year'], format='%Y')
platform_lifecycle['end_year'] = pd.to_datetime(platform_lifecycle['end_year'], format='%Y')
platform_lifecycle['lifespan'] = platform_lifecycle['end_year'] - platform_lifecycle['start_year']

In [39]:
# plotting the lifecycles of the top platforms 

fig = px.timeline(   
    platform_lifecycle,
    x_start='start_year',
    x_end='end_year',
    y='platform',
    title='Top Platform Lifecycle'   
)

fig.update_yaxes(autorange="reversed")
fig.update_layout(xaxis_title='Year', yaxis_title='Platform')
display(fig.show())

None

In [15]:
platform_lifecycle['lifespan'].median() / 365

Timedelta('10 days 12:07:53.424657534')

<div class='alert alert-info'>

Prior to 2006 only one platform led game sales at a time with the playstation leading the ranks from 1994 through 2001 when the playstation 2 hit the market. In 2005 the video game industry saw many new platforms hit the market such as the Nintendo DS, Wii, & Xbox 360 which dethroned Playstation as the pen-ultimate gaming platform. Out of all the top platforms, the nintendo DS had the longest lifespan with the first release on the platform in 1985 and the last release in 2013, a life span of almost 30 years! The average lifespan of a top-performing platform was roughly 10 years and it typically took two years for a new platform to dethrone the current best-selling platform for number one.

However all of the top platforms have annuals sales at or close to zero as of 2016. 

</div>

<div class='alert alert-info'>

Now that we've explored the historical data let's begin to unearth the trends and patterns in past successful games & platforms that may indicate future successes going into 2017.

To do this we will filter the data that is relevant to the current environment of the video game market. A market where consumers have more choices than ever for not only a single game, but also the platform. This time frame of relevant data in my eyes started in 2005 when there were three popular gaming platforms available on the market (PS2, DS, XBOX 360).

</div>

In [16]:
df.sample()

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating,total_sales
3908,Iron Man,DS,2008.0,Action,0.35,0.12,0.0,0.05,56.0,7.0,E10+,0.52


In [17]:
# filtering relevant data
sample_df = df[(df['year_of_release'] >= 2010.0) & (df['year_of_release'] < 2016.0)]
sample_df.head()

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating,total_sales
14,Kinect Adventures!,X360,2010.0,Misc,15.0,4.89,0.24,1.69,61.0,6.3,E,21.82
16,Grand Theft Auto V,PS3,2013.0,Action,7.02,9.09,0.98,3.96,97.0,8.2,M,21.05
23,Grand Theft Auto V,X360,2013.0,Action,9.66,5.14,0.06,1.41,97.0,8.1,M,16.27
27,Pokemon Black/Pokemon White,DS,2010.0,Role-Playing,5.51,3.17,5.65,0.8,-1.0,-1.0,-1,15.13
29,Call of Duty: Modern Warfare 3,X360,2011.0,Shooter,9.04,4.24,0.13,1.32,88.0,3.4,M,14.73


<div class='alert alert-info'>

Thank you for catching that 

</div>

In [40]:
# top selling platforms in sample

best_sellers_sample = sample_df.groupby(['platform', 'year_of_release'])['total_sales'].sum().reset_index().sort_values(by='total_sales', ascending=False)

# plotting total sales
fig = px.bar(
    best_sellers_sample,
    x='platform',
    y='total_sales',
    title='Best Selling Platforms',
    labels={
        'total_sales': 'Total Sales (USD millions)',
        'platform': 'Platform'
    }
)

display(fig.show())

None

In [41]:
# what platform's sales are increasing

fig = px.bar(
    best_sellers_sample,
    x='year_of_release',
    y='total_sales',
    color='platform',
    title='Distribution of Annual Game Sales by Platform',
    labels={
        'year_of_release': 'Year',
        'total_sales': 'Total Sales (millions)',
        'platform': 'Platform'
    }
)

display(fig.show())

None

<div class='alert alert-info'>
Taking a look at our graph above it is evident that the PS4 is the top selling platform followed by the XBOX One and Nintendo 3DS. However, the latter two platforms have  loss significant share of their annual sales since the PS4 came on the market. Here are the platforms I expect to be profitable going into 2017:

1. PS4
2. PS5
3. XBOX One

</div>

In [42]:
# box plot for global sales of all games by platform

fig = px.box(
    best_sellers_sample,
    x='platform',
    y='total_sales',
    title='Global Game Sales by Platform',
    labels={
        'platform': 'Platform',
        'total_sales': 'Total Sales (millions)'
    }
)

display(fig.show())

None

<div class='alert alert-info'>

The difference in total sales from the best selling platforms to the worse, is extremely significant. However, an interesting onbservation pertains to the average sales price across platforms. The best selling platform, The Wii, has an average 60,000,000 units sold per game on its platform which is not far off from the PS2's 54,000,000 and the Nintendo 3DS's 47,500,000 units sold per game release. 

</div>

In [43]:
# Filtering for data relevant to the XBOX 360
need_for_speed = df[df['name'] == 'Need for Speed: Most Wanted']

# creating a scatter plot to test correlation between critic scores & total sales

fig = px.scatter(
    need_for_speed,
    x='critic_score',
    y='total_sales',
    title='Critic Reviews vs. Total Sales (Need for Speed: Most Wanted)',
    labels={
        'critic_score': 'Critic Score',
        'total_sales': 'Total Sales'
    }
    )

display(fig.show())

None

In [22]:
# Calculating the pearson's coefficient between critic scores & game sales

critic_corr = need_for_speed['critic_score'].corr(need_for_speed['total_sales'])
print(f'The correlation between Critic Scores & Sales of Need for Speed: Most Wanted is {round(critic_corr, 2)}')

The correlation between Critic Scores & Sales of Need for Speed: Most Wanted is 0.1


In [44]:
# creating a scatter plot to visualize correlation between user scores & total sales

fig = px.scatter(
    need_for_speed,
    x='user_score',
    y='total_sales',
    title='User Reviews vs. Total Sales (Need for Speed: Most Wanted)',
    labels={
        'user_score': 'User Scores',
        'total_sales': 'Total Sales'
    }
    )

display(fig.show())

None

In [24]:
# Calculating the pearson's coefficient between user scores & game sales

user_corr = need_for_speed['user_score'].corr(need_for_speed['total_sales'])
print(f'The correlation between User Scores & Sales of Need for Speed: Most Wanted is {round(user_corr, 2)}')

The correlation between User Scores & Sales of Need for Speed: Most Wanted is -0.04


<div class='alert alert-info'>

Both user & critic scores have a weak positive correlation with total game sales.

Now let's take a look at the distribution of sales of *"Need for Speed: Most Wanted"* across platforms

</div>

In [45]:
# visualizing how the same game performed across all platforms

fig = px.bar(
    need_for_speed,
    x='platform',
    y='total_sales',
    title='Need for Speed: Most Wanted\'s Sales Performance Across Platforms',
    labels={
        'platform': 'Platform',
        'total_sales': 'Sales (USD millions)'
    }
)

display(fig.show())

None

In [46]:
genre_sum = df.groupby('genre')['total_sales'].sum().reset_index().sort_values(by='total_sales', ascending=False)
genre_mean = df.groupby('genre')['total_sales'].mean().reset_index().sort_values(by='total_sales', ascending=False)

fig1 = px.bar(
    genre_sum,
    x='genre',
    y='total_sales',
    title='Total Global Game Sales by Genre',
    labels={
        'genre': 'Genre',
        'total_sales': 'Total Sales (USD millions)'
    }
)

fig2 = px.box(
    genre_mean,
    x='genre',
    y='total_sales',
    title='Average Global Game Sales by Genre',
    labels={
        'genre': 'Genre',
        'total_sales': 'Total Sales (USD millions)'
    }
)

display(fig1.show())
display(fig2.show())

None

None

## User Profiles by Region

In [27]:
df.sample()

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating,total_sales
7999,Battle of Giants: Dinosaurs Strike,Wii,2010.0,Strategy,0.14,0.03,0.0,0.01,-1.0,-1.0,E10+,0.18


In [28]:
df.groupby('platform')['eu_sales'].sum().reset_index().sort_values(by='eu_sales', ascending=False)

Unnamed: 0,platform,eu_sales
16,PS2,332.63
17,PS3,327.21
28,X360,268.32
26,Wii,258.32
15,PS,212.39
4,DS,188.41
18,PS4,141.09
13,PC,140.37
6,GBA,74.59
19,PSP,65.62


In [29]:
# creating function to determine the top 5 platforms or genres for each region

def user_profile(df: pd.DataFrame, region: str, feature: str):  
    grp = df.groupby(feature)[region].sum().reset_index().sort_values(by=region, ascending=False).reset_index(drop=True)
    return grp

In [30]:
eu_genre = user_profile(df, 'eu_sales', 'genre')
eu_platform = user_profile(df, 'eu_sales', 'platform')

na_genre = user_profile(df, 'na_sales', 'genre')
na_platform = user_profile(df, 'na_sales', 'platform')

jp_genre = user_profile(df, 'jp_sales', 'genre')
jp_platform = user_profile(df, 'jp_sales', 'platform')

In [47]:
# Adding a column in each dataframe with the corresponsing region in
# order to concatenate the dataframes for each region and genre/platform
eu_genre['region'] = 'EU'
eu_platform['region'] = 'EU'
na_genre['region'] = 'NA'
na_platform['region'] = 'NA'
jp_genre['region'] = 'JP'
jp_platform['region'] = 'JP'

# Combining all dataframes for genres
combined_genre_df = pd.concat([eu_genre.iloc[0:5], na_genre.iloc[0:5], jp_genre.iloc[0:5]])

# Creating a bar plot for genre distribution in each region
fig_genre = px.bar(combined_genre_df, x='genre', y=['eu_sales', 'na_sales', 'jp_sales'], color='region',
                    title='Distribution of Game Genre Sales in Each Region',
                    labels={'sales': 'Sales', 'genre': 'Genre'},
                    barmode='group')

# Combining all dataframes for platforms
combined_platform_df = pd.concat([eu_platform.iloc[0:5], na_platform.iloc[0:5], jp_platform.iloc[0:5]])

# Creating a bar plot for platform distribution in each region
fig_platform = px.bar(combined_platform_df, x='platform', y=['eu_sales', 'na_sales', 'jp_sales'], color='region',
                       title='Distribution of Gaming Platform Sales in Each Region',
                       labels={'sales': 'Sales', 'platform': 'Platform'},
                       barmode='group')

display(fig_genre.show())
display(fig_platform.show())

None

None

## Do ESRB ratings affect sales in individual regions?

In [48]:
# visualizing relationship between ESRB ratings and sales for each region

eu_esrb_vs_sales = px.scatter(
    df,
    x='rating',
    y='eu_sales',
    labels={
        'rating': 'Rating',
        'eu_sales': '(USD millions)'
    },
    title='ESRB Ratings vs. Sales in the EU Region'
)

na_esrb_vs_sales = px.scatter(
    df,
    x='rating',
    y='na_sales',
    labels={
        'rating': 'Rating',
        'na_sales': '(USD millions)'
    },
    title='ESRB Ratings vs. Sales in the NA Region'
)

jp_esrb_vs_sales = px.scatter(
    df,
    x='rating',
    y='jp_sales',
    labels={
        'rating': 'Rating',
        'jp_sales': 'Sales (USD millions)'
    },
    title='ESRB Ratings vs. Sales in the JP Region'
)

display(eu_esrb_vs_sales.show())
display(na_esrb_vs_sales.show())
display(jp_esrb_vs_sales.show())

None

None

None

<div class='alert alert-info'>

ESRB ratings do seem to affect sales similarly in all regions. Games rated E for everyone, M for mature, T for teen, & E10+ for everyone above the age of ten bring in the most sales across all three regions.
    
As you may remember from the data cleaning process, the value `0` represents missing ratings for games, meaning that each region's second best selling games based on ratings are games that do not have an official ESRB rating.

</div>

# Hypothesis Testing

### Hypothesis 1: Average user ratings of the Xbox One and PC platforms are the same.

**Statistical Test: Independent Samples T-Test**
- Since we are not testing a hypothesis on the company as a whole, but rather two different gaming platforms they sell on, we will test the hypothesis using a two-tailed independent samples test.

**Null Hypothesis (H0): Average user ratings of the Xbox One and PC platforms are the same**
- I chose this hypothesis because the null hypothesis is a statement of no difference

**Alternative Hypothesis (H1): Average user ratings of the Xbox One and PC platforms are different.**
- The alternative hypothesis is always the opposite of the assumption you are making about the data / what you are testing for. In this scenario that is the hypothesis that the average user ratings for the Xbox One & PC platforms are different.

**Alpha Value (Significance Level): 0.05**

I chose 5% as the significance level because it poses the least amount of risk of rejecting the Null Hypotheses when it is actually true.

In [33]:
# filtering the data into two datasets, one with all the xbox one data, & another with all the PC data
xbox1 = df[df['platform'] == 'XOne']
pc = df[df['platform'] == 'PC']

# performing the independent samples t-test
results = st.ttest_ind(xbox1['user_score'], pc['user_score'], equal_var=False)
alpha = 0.5

print(f"P-Value: {results.pvalue}")

if results.pvalue <= alpha:
    print("We can reject the null hypothesis that the average user scores for the Xbox One & PC are the same.")
else:
    print("We cannot reject the null hypothesis that the average user scores for the Xbox One & PC are different.")

P-Value: 0.0012202763945006337
We can reject the null hypothesis that the average user scores for the Xbox One & PC are the same.


### Hypothesis 2: Average user ratings for the Action and Sports genres are different.

**Statistical Test: Independent Samples T-Test**
- Since we are not testing a hypotheses on the company as a whole, but rather two different gaming genres they sell, we will test the hypotheses using a two-tailed independent samples test.

**Null Hypothesis (H0): Average user ratings for the Action and Sports genres are the same.**
- I chose this hypotheses because the null hypotheses is a statement of no difference, so in our scenario of testing whether the average user ratings for the Action and Sports genres are different, the null hypotheses would be the opposite of that.

**Alternative Hypothesis (H1): Average user ratings for the Action and Sports genres are different.**
- The alternative hypothesis always contradicts the null hypothesis. In this scenario that is the hypothesis that the Average user ratings for the Action and Sports genres are different.

**Alpha Value (Significance Level): 0.05**

I chose 5% as the significance level because it poses the least amount of risk of rejecting the Null Hypotheses when it is actually true.

In [34]:
# filtering the data into two datasets, one with all the Action genre data, & another with all the Sports genre data
action = df[df['genre'] == 'Action']
sports = df[df['genre'] == 'Sports']

# performing the independent samples t-test
results = st.ttest_ind(action['user_score'], sports['user_score'], equal_var=False)
alpha = 0.5

print(f"P-Value: {results.pvalue}")

if results.pvalue <= alpha:
    print("We can reject the null hypothesis that the average user scores for the Xbox One & PC are the same.")
else:
    print("We cannot reject the null hypothesis that the average user scores for the Xbox One & PC are different.")

P-Value: 7.595997985225105e-09
We can reject the null hypothesis that the average user scores for the Xbox One & PC are the same.


## Conclusion

- From the results of my analysis, I can confidently say that games which fall into one of the following buckets *- Rated E, Rated M, Rated T, Rated E10+, Action Games, or Sports Games -* will sell better than games which do not.
- I would also recommend that you increase your production of games made for the PS4 since it has become your top selling platform in recent years.
- There was sufficient evidence to reject the first null hypothesis that the average user scores for the Xbox One & PC are the same, as well as the second null hypothesis that the average user scores for the Xbox One & PC same 