# Video Game Analysis

## 1. Import and Clean Data

### 1a. Import Libraries and Data

In [168]:
# Import Libraries
import numpy as np
import pandas as pd
import plotly_express as px

In [169]:
# Load the video game sales CSV directly from its raw GitHub URL into `vg`
url = "https://raw.githubusercontent.com/kellyshreeve/Integrated_Project_1/main/moved_games.csv"
vg = pd.read_csv(filepath_or_buffer=url)

In [170]:
# Preview the opening 15 records to get a feel for the columns and values
vg.head(n=15)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,NA_sales,EU_sales,JP_sales,Other_sales,Critic_Score,User_Score,Rating
0,Wii Sports,Wii,2006.0,Sports,41.36,28.96,3.77,8.45,76.0,8.0,E
1,Super Mario Bros.,NES,1985.0,Platform,29.08,3.58,6.81,0.77,,,
2,Mario Kart Wii,Wii,2008.0,Racing,15.68,12.76,3.79,3.29,82.0,8.3,E
3,Wii Sports Resort,Wii,2009.0,Sports,15.61,10.93,3.28,2.95,80.0,8.0,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,11.27,8.89,10.22,1.0,,,
5,Tetris,GB,1989.0,Puzzle,23.2,2.26,4.22,0.58,,,
6,New Super Mario Bros.,DS,2006.0,Platform,11.28,9.14,6.5,2.88,89.0,8.5,E
7,Wii Play,Wii,2006.0,Misc,13.96,9.18,2.93,2.84,58.0,6.6,E
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,14.44,6.94,4.7,2.24,87.0,8.4,E
9,Duck Hunt,NES,1984.0,Shooter,26.93,0.63,0.28,0.47,,,


In [171]:
# Print dataset info: entry count, per-column non-null counts, dtypes and
# memory usage. Used here to spot columns with missing values and columns
# whose dtype needs fixing.
vg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


This dataset has a total of 16715 entries. There are missing values in Name, Year_of_Release, Genre, Critic_Score, User_Score, and Rating. Year_of_Release needs to be changed to int data type and User_Score needs to be changed to float data type. Variable names should be changed to snake case.

### 1b. Rename Columns and Fix Data Types

In [172]:
# Convert every column label to snake case. Each original label maps to its
# all-lowercase form, so the mapping is derived rather than typed out by hand.
original_labels = [
    'Name', 'Platform', 'Year_of_Release', 'Genre',
    'NA_sales', 'EU_sales', 'JP_sales', 'Other_sales',
    'Critic_Score', 'User_Score', 'Rating',
]
snake_case_map = {label: label.lower() for label in original_labels}

vg = vg.rename(columns=snake_case_map)

print(vg.columns)

Index(['name', 'platform', 'year_of_release', 'genre', 'na_sales', 'eu_sales',
       'jp_sales', 'other_sales', 'critic_score', 'user_score', 'rating'],
      dtype='object')


In [173]:
# Preparation for casting year_of_release to an integer type:
# list its distinct values first, to confirm none has a fractional part
distinct_years = vg['year_of_release'].unique()
print(distinct_years)

[2006. 1985. 2008. 2009. 1996. 1989. 1984. 2005. 1999. 2007. 2010. 2013.
 2004. 1990. 1988. 2002. 2001. 2011. 1998. 2015. 2012. 2014. 1992. 1997.
 1993. 1994. 1982. 2016. 2003. 1986. 2000.   nan 1995. 1991. 1981. 1987.
 1980. 1983.]


The values are all whole numbers, so it is safe to convert 'year_of_release' to int.

In [174]:
# Convert 'year_of_release' to an integer dtype.
# The nullable 'Int64' dtype (capital I) is used because the column contains
# missing values, which a plain NumPy int column cannot represent.
vg['year_of_release'] = vg['year_of_release'].astype('Int64')

# Stored as `year_dtype` rather than `type`, so the built-in type() is not
# shadowed for every cell that runs afterwards in this kernel.
year_dtype = vg['year_of_release'].dtype
print(f'The dtype for "year_of_release" now is: {year_dtype}')

The dtype for "year_of_release" now is: Int64


In [175]:
# List the distinct raw entries of 'user_score' to see why it loaded as object
distinct_user_scores = vg['user_score'].unique()
print(distinct_user_scores)

['8' nan '8.3' '8.5' '6.6' '8.4' '8.6' '7.7' '6.3' '7.4' '8.2' '9' '7.9'
 '8.1' '8.7' '7.1' '3.4' '5.3' '4.8' '3.2' '8.9' '6.4' '7.8' '7.5' '2.6'
 '7.2' '9.2' '7' '7.3' '4.3' '7.6' '5.7' '5' '9.1' '6.5' 'tbd' '8.8' '6.9'
 '9.4' '6.8' '6.1' '6.7' '5.4' '4' '4.9' '4.5' '9.3' '6.2' '4.2' '6' '3.7'
 '4.1' '5.8' '5.6' '5.5' '4.4' '4.6' '5.9' '3.9' '3.1' '2.9' '5.2' '3.3'
 '4.7' '5.1' '3.5' '2.5' '1.9' '3' '2.7' '2.2' '2' '9.5' '2.1' '3.6' '2.8'
 '1.8' '3.8' '0' '1.6' '9.6' '2.4' '1.7' '1.1' '0.3' '1.5' '0.7' '1.2'
 '2.3' '0.5' '1.3' '0.2' '0.6' '1.4' '0.9' '1' '9.7']


user_score values include float, nan, and tbd. I will further explore the tbd value to see if it's associated with a specific year, country, or rating. 

In [176]:
# Look at the first 30 games whose user_score is the literal string 'tbd',
# searching for any pattern that would explain the placeholder
is_tbd = vg['user_score'] == 'tbd'
display(vg[is_tbd].head(30))

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
119,Zumba Fitness,Wii,2010.0,Sports,3.45,2.59,0.0,0.66,,tbd,E
301,Namco Museum: 50th Anniversary,PS2,2005.0,Misc,2.08,1.35,0.0,0.54,61.0,tbd,E10+
520,Zumba Fitness 2,Wii,2011.0,Sports,1.51,1.03,0.0,0.27,,tbd,T
645,uDraw Studio,Wii,2010.0,Misc,1.65,0.57,0.0,0.2,71.0,tbd,E
657,Frogger's Adventures: Temple of the Frog,GBA,,Adventure,2.15,0.18,0.0,0.07,73.0,tbd,E
718,Just Dance Kids,Wii,2010.0,Misc,1.52,0.54,0.0,0.18,,tbd,E
726,Dance Dance Revolution X2,PS2,2009.0,Simulation,1.09,0.85,0.0,0.28,,tbd,E10+
821,The Incredibles,GBA,2004.0,Action,1.15,0.77,0.04,0.1,55.0,tbd,E
881,Who wants to be a millionaire,PC,1999.0,Misc,1.94,0.0,0.0,0.0,,tbd,E
1047,Tetris Worlds,GBA,2001.0,Puzzle,1.25,0.39,0.0,0.06,65.0,tbd,E


Video Games with user_score of tbd are all associated with jp_sales of almost zero. I will check if there are any jp_sales of almost zero that have user ratings other than tbd to confirm whether this is the reason for the rating. 

In [177]:
# Do games with near-zero Japanese sales (0 to 0.05, both ends inclusive)
# always carry a 'tbd' user_score? Show the first 30 such rows to find out.
near_zero_jp = vg['jp_sales'].between(0, .05)
display(vg[near_zero_jp].head(30))

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
60,Call of Duty: Ghosts,X360,2013.0,Shooter,6.73,2.56,0.04,0.91,73.0,2.6,M
61,Just Dance 3,Wii,2011.0,Misc,5.95,3.11,0.0,1.06,74.0,7.8,E10+
66,Halo 4,X360,2012.0,Shooter,6.65,2.28,0.04,0.74,87.0,7,M
68,Just Dance 2,Wii,2010.0,Misc,5.8,2.85,0.01,0.78,74.0,7.3,E10+
72,Minecraft,X360,2013.0,Misc,5.7,2.65,0.02,0.81,,,
78,Halo 2,XB,2004.0,Shooter,6.82,1.53,0.05,0.08,95.0,8.2,M
85,The Sims 3,PC,2009.0,Simulation,0.99,6.42,0.0,0.6,86.0,7.6,T
89,Pac-Man,2600,1982.0,Puzzle,7.28,0.45,0.0,0.08,,,
99,Call of Duty: Black Ops 3,XOne,2015.0,Shooter,4.59,2.11,0.01,0.68,,,
100,Call of Duty: World at War,X360,2008.0,Shooter,4.81,1.88,0.0,0.69,84.0,7.6,M


There are user_scores other than tbd for other instances of jp_sales close to zero. It does not appear that these sales figures are the reason for the tbd rating. There do not appear to be any other patterns in year, genre, sales, critic_score, or rating that would explain the user_score of tbd. Because there are no clear patterns explaining this value, I will treat tbd as a missing value. 

In [178]:
# Treat the 'tbd' placeholder in user_score as a missing value (NaN)
placeholder = 'tbd'
vg['user_score'] = vg['user_score'].replace(to_replace=placeholder, value=np.nan)

# Confirm that 'tbd' no longer appears among the distinct values
remaining_scores = vg['user_score'].unique()
print(remaining_scores)

['8' nan '8.3' '8.5' '6.6' '8.4' '8.6' '7.7' '6.3' '7.4' '8.2' '9' '7.9'
 '8.1' '8.7' '7.1' '3.4' '5.3' '4.8' '3.2' '8.9' '6.4' '7.8' '7.5' '2.6'
 '7.2' '9.2' '7' '7.3' '4.3' '7.6' '5.7' '5' '9.1' '6.5' '8.8' '6.9' '9.4'
 '6.8' '6.1' '6.7' '5.4' '4' '4.9' '4.5' '9.3' '6.2' '4.2' '6' '3.7' '4.1'
 '5.8' '5.6' '5.5' '4.4' '4.6' '5.9' '3.9' '3.1' '2.9' '5.2' '3.3' '4.7'
 '5.1' '3.5' '2.5' '1.9' '3' '2.7' '2.2' '2' '9.5' '2.1' '3.6' '2.8' '1.8'
 '3.8' '0' '1.6' '9.6' '2.4' '1.7' '1.1' '0.3' '1.5' '0.7' '1.2' '2.3'
 '0.5' '1.3' '0.2' '0.6' '1.4' '0.9' '1' '9.7']


In [179]:
# Parse the user_score strings into numbers; any entry that cannot be parsed
# raises, so no value is silently dropped
numeric_user_score = pd.to_numeric(vg['user_score'], errors='raise')
vg['user_score'] = numeric_user_score

# Show the dtype of every column to verify the conversions so far
vg.dtypes

name                object
platform            object
year_of_release      Int64
genre               object
na_sales           float64
eu_sales           float64
jp_sales           float64
other_sales        float64
critic_score       float64
user_score         float64
rating              object
dtype: object

All variables are now the correct data type.

### 1c. Address Missing Values

In [180]:
# Tally the NaN entries per column and print them under a heading
missing_per_column = vg.isna().sum()

print('The number of missing values in each variable:', missing_per_column, sep='\n')

The number of missing values in each variable:
name                  2
platform              0
year_of_release     269
genre                 2
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8578
user_score         9125
rating             6766
dtype: int64


There are missing values for name, year_of_release, genre, critic_score, user_score, and rating. 

I will display missing values for each variable to identify patterns and determine whether and how to fill the values.

In [181]:
# Show every row that has no game name
nameless = vg['name'].isna()

print('The missing values for name:')
display(vg[nameless])

The missing values for name:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
659,,GEN,1993,,1.78,0.53,0.0,0.08,,,
14244,,GEN,1993,,0.0,0.0,0.03,0.0,,,


The two rows missing on name are also the two rows missing on genre. These rows are additionally missing critic_score, user_score, and rating but do have complete information for platform, year, and sales. Because one of the major questions for this analysis is whether platform and year of release are related to sales, I will leave these rows and address the missing values. 

For now, I will fill name and genre with 'unknown' and address the critic_score, user_score, and rating later on.

In [182]:
# Label missing name and genre entries as 'unknown' so these rows stay in the
# data for the platform / year / sales analyses
for text_col in ('name', 'genre'):
    vg[text_col] = vg[text_col].fillna('unknown')

In [184]:
# Show the first 15 rows that have no release year
yearless = vg['year_of_release'].isna()

print('A sample of rows with missing values for year_of_release:')
display(vg[yearless].head(15))

A sample of rows with missing values for year_of_release:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
183,Madden NFL 2004,PS2,,Sports,4.26,0.26,0.01,0.71,94.0,8.5,E
377,FIFA Soccer 2004,PS2,,Sports,0.59,2.36,0.04,0.51,84.0,6.4,E
456,LEGO Batman: The Videogame,Wii,,Action,1.8,0.97,0.0,0.29,74.0,7.9,E10+
475,wwe Smackdown vs. Raw 2006,PS2,,Fighting,1.57,1.02,0.0,0.41,,,
609,Space Invaders,2600,,Shooter,2.36,0.14,0.0,0.03,,,
627,Rock Band,X360,,Misc,1.93,0.33,0.0,0.21,92.0,8.2,T
657,Frogger's Adventures: Temple of the Frog,GBA,,Adventure,2.15,0.18,0.0,0.07,73.0,,E
678,LEGO Indiana Jones: The Original Adventures,Wii,,Action,1.51,0.61,0.0,0.21,78.0,6.6,E10+
719,Call of Duty 3,Wii,,Shooter,1.17,0.84,0.0,0.23,69.0,6.7,T
805,Rock Band,Wii,,Misc,1.33,0.56,0.0,0.2,80.0,6.3,T


There are not any apparent patterns in platform, genre, sales, score, or rating that explain the missing year_of_release values. A Google search of the names shows that these games were released in many different years. Because a major portion of this analysis is to determine patterns based on year of release, filling these 269 values with the mean or median could skew the results in favor of that year. 

Therefore, I will not fill these missing values and leave these games out of analyses including year_of_release.

In [185]:
# Show the first 15 rows that have no critic_score
no_critic_score = vg['critic_score'].isna()

print('A sample of missing values for critic_score:')
display(vg[no_critic_score].head(15))

A sample of missing values for critic_score:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
1,Super Mario Bros.,NES,1985,Platform,29.08,3.58,6.81,0.77,,,
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,11.27,8.89,10.22,1.0,,,
5,Tetris,GB,1989,Puzzle,23.2,2.26,4.22,0.58,,,
9,Duck Hunt,NES,1984,Shooter,26.93,0.63,0.28,0.47,,,
10,Nintendogs,DS,2005,Simulation,9.05,10.95,1.93,2.74,,,
12,Pokemon Gold/Pokemon Silver,GB,1999,Role-Playing,9.0,6.18,7.2,0.71,,,
18,Super Mario World,SNES,1990,Platform,12.78,3.75,3.54,0.55,,,
20,Pokemon Diamond/Pokemon Pearl,DS,2006,Role-Playing,6.38,4.46,6.04,1.36,,,
21,Super Mario Land,GB,1989,Platform,10.83,2.71,4.18,0.42,,,
22,Super Mario Bros. 3,NES,1988,Platform,9.54,3.44,3.84,0.46,,,


All rows missing critic score in this sample are also missing user_score and rating. There appears to be a large pattern in the missing values of these variables. Missing value imputation requires that there are not large patterns in missing values in the dataset. Therefore, imputing the appropriate mean, median, or mode for these values across the board would be a statistically inappropriate way to handle these values.

However, while these three variables had similar numbers of missing values, their counts weren't identical, meaning there are some rows that are missing only one or two of the three variables. I will use imputation of mean or median for critic_score and user_score on rows that are missing only one or the other. For rows that have at least one of critic_score or user_score but are missing rating, I will fill the rating value with 'unknown'.

Rows that are missing simultaneously on critic_score, user_score, and rating will be left out of analyses using these values.

In [186]:
# Count the rows in which critic_score, user_score and rating are all missing
all_three_missing = vg[['critic_score', 'user_score', 'rating']].isna().all(axis=1)
cs_us_rat_missing = int(all_three_missing.sum())

print(f'The number of rows missing on critic_score, user_score, and rating: {cs_us_rat_missing}')

The number of rows missing on critic_score, user_score, and rating: 6667


In [199]:
# Where a row has a critic_score, a user_score, or both, replace a missing
# rating with 'unknown'. Rows missing both scores are left untouched.
s1 = vg['critic_score'].isna() & vg['user_score'].notna()     # user score only
s2 = vg['critic_score'].notna() & vg['user_score'].isna()     # critic score only
s3 = vg['critic_score'].notna() & vg['user_score'].notna()    # both scores

# The three masks never overlap, so a single fill over their union gives the
# same result as filling each group in turn
any_score = s1 | s2 | s3
vg.loc[any_score, 'rating'] = vg.loc[any_score, 'rating'].fillna(value='unknown')

In [65]:
# Plot the distribution of critic_score to choose between mean and median
# imputation
cs_hist = px.histogram(vg, x='critic_score', title='Critic Score')

# Transparent plot area and surrounding paper
cs_hist.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

cs_hist.show()

critic_score is left-skewed, so I will fill missing values with the median.

In [66]:
# Plot the distribution of user_score to choose between mean and median
# imputation
us_hist = px.histogram(vg, x='user_score', title='User Score')

# Transparent plot area and surrounding paper
us_hist.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

us_hist.show()

user_score is highly left skewed, so I will fill missing values with the median.

In [201]:
# Impute user_score with the column median, but only in rows that have a
# critic_score. Rows missing both scores stay NaN.
has_critic_score = vg['critic_score'].notna()

# Median taken over every user_score present before any value is filled in
user_score_median = vg['user_score'].median()

vg.loc[has_critic_score, 'user_score'] = (
    vg.loc[has_critic_score, 'user_score'].fillna(user_score_median)
)

# Show the per-column missing counts so the effect of the fill is visible
vg.isna().sum()

name                  0
platform              0
year_of_release     269
genre                 0
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8578
user_score         8005
rating             6667
dtype: int64


Missing user_scores are replaced with the median of user_score in rows where critic_score is present. User_scores have been left as missing values in rows that are also missing critic_score.

In [203]:
# Impute critic_score with the column median, but only in rows that have a
# user_score. Rows missing both scores stay NaN.
has_user_score = vg['user_score'].notna()

# Median taken over every critic_score present before any value is filled in
critic_score_median = vg['critic_score'].median()

vg.loc[has_user_score, 'critic_score'] = (
    vg.loc[has_user_score, 'critic_score'].fillna(critic_score_median)
)

# Show the per-column missing counts so the effect of the fill is visible
vg.isna().sum()

name                  0
platform              0
year_of_release     269
genre                 0
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8005
user_score         8005
rating             6667
dtype: int64


In [213]:
# Report the per-column missing counts that remain after imputation;
# the heading is followed by one blank line
remaining_missing = vg.isna().sum()

print('After logical imputation, these are the missing values still in dataset:', end='\n\n')
print(remaining_missing)

After logical imputation, these are the missing values still in dataset:

name                  0
platform              0
year_of_release     269
genre                 0
na_sales              0
eu_sales              0
jp_sales              0
other_sales           0
critic_score       8005
user_score         8005
rating             6667
dtype: int64


After filling missing names and genres with 'unknown', filling critic_scores that had a user_score and user_scores that had a critic_score with their respective medians, and filling ratings that had either a critic_score or a user_score with 'unknown', these are the missing values left in the dataset. 

While there are still missing values, I argue that, based on the analysis, it doesn't make sense to impute the median for year. This is because year in this analysis is treated as categorical data, in that we will compare sales and scores across years. Therefore, imputing the mean or median of year would unduly weight that year's category. There are only 269 missing years. I will leave these values as missing.

Because of the large pattern in missing data for critic_score, user_score, and rating, I argue that it is not valid to impute values for games that are missing on all three values. I have imputed the median for critic_score where there was a user_score and the median user_score where there was a critic_score. I will assume that these missing values were due to another reason and not part of the larger pattern of missing across all three. 

Missing ratings were replaced with 'unknown' in rows that had values for at least one of critic_score or user_score.

I am confident in these missing value replacement strategies and will proceed with the analysis using the remaining data.

### 1d. Check for Duplicates

In [207]:
# Lower-case name and platform so entries that differ only in capitalisation
# compare as equal, then count the rows that repeat an earlier row in every
# column
for text_col in ('name', 'platform'):
    vg[text_col] = vg[text_col].str.lower()

duplicates = vg.duplicated().sum()

print(f'The number of fully duplicate rows is: {duplicates}')

The number of fully duplicate rows is: 0


In [208]:
# Count implicit duplicates: rows that repeat an earlier row's
# name / platform / year_of_release combination
key_cols = ['name', 'platform', 'year_of_release']
name_plat_duplicates = vg.duplicated(subset=key_cols).sum()

print(f'The number of name-platform-year duplicates is: {name_plat_duplicates}')

The number of name-platform-year duplicates is: 2


In [209]:
# Show the rows flagged as repeats (the later occurrence of each
# name / platform / year combination)
repeat_mask = vg[['name', 'platform', 'year_of_release']].duplicated()

print('The two rows with duplicates are:')
display(vg[repeat_mask])

The two rows with duplicates are:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
14244,unknown,gen,1993,unknown,0.0,0.0,0.03,0.0,,,
16230,madden nfl 13,ps3,2012,Sports,0.0,0.01,0.0,0.0,83.0,5.5,E


In [210]:
# Show both records of the first duplicated key: unknown / gen / 1993
first_pair = (
    vg['name'].eq('unknown')
    & vg['platform'].eq('gen')
    & vg['year_of_release'].eq(1993)
)

print('The first duplicated rows are:')
display(vg[first_pair])

The first duplicated rows are:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
659,unknown,gen,1993,unknown,1.78,0.53,0.0,0.08,,,
14244,unknown,gen,1993,unknown,0.0,0.0,0.03,0.0,,,


These two rows are identical other than na_sales, eu_sales, jp_sales, and other_sales. Because the sales figures are almost zero for the second row, I believe the second row is a mistake. I will delete the second row.

In [211]:
# Show both records of the second duplicated key: madden nfl 13 / ps3 / 2012
second_pair = (
    vg['name'].eq('madden nfl 13')
    & vg['platform'].eq('ps3')
    & vg['year_of_release'].eq(2012)
)

print('The second duplicated rows are:')
display(vg[second_pair])

The second duplicated rows are:


Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating
604,madden nfl 13,ps3,2012,Sports,2.11,0.22,0.0,0.23,83.0,5.5,E
16230,madden nfl 13,ps3,2012,Sports,0.0,0.01,0.0,0.0,83.0,5.5,E


These two rows are identical other than na_sales, eu_sales, jp_sales, and other_sales. Because the sales figures are almost zero for the second row, I believe the second row is a mistake. I will delete the second row.


In [53]:
# Remove implicit duplicates: for each name / platform / year combination keep
# the first record and drop later ones, then renumber the index from 0
key_cols = ['name', 'platform', 'year_of_release']
deduplicated = vg.drop_duplicates(subset=key_cols, keep='first')
vg = deduplicated.reset_index(drop=True)

In [54]:
# Recount name / platform / year repeats to confirm that none remain
key_cols = ['name', 'platform', 'year_of_release']
name_plat_duplicates_2 = vg.duplicated(subset=key_cols).sum()

print(f'The number of name-platform-year duplicates now is: {name_plat_duplicates_2}')

The number of name-platform-year duplicates now is: 0


### 1e. Add Additional Features

In [214]:
# Calculate each game's total sales as the row-wise sum of the four regional
# sales columns.
# The columns are selected by name rather than by position (iloc[:, 4:8]) so
# the sum stays correct if columns are later added, removed or reordered.
region_sales_cols = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales']
vg['total_sales'] = vg[region_sales_cols].sum(axis=1)

vg.head()

Unnamed: 0,name,platform,year_of_release,genre,na_sales,eu_sales,jp_sales,other_sales,critic_score,user_score,rating,total_sales
0,wii sports,wii,2006,Sports,41.36,28.96,3.77,8.45,76.0,8.0,E,82.54
1,super mario bros.,nes,1985,Platform,29.08,3.58,6.81,0.77,,,,40.24
2,mario kart wii,wii,2008,Racing,15.68,12.76,3.79,3.29,82.0,8.3,E,35.52
3,wii sports resort,wii,2009,Sports,15.61,10.93,3.28,2.95,80.0,8.0,E,32.77
4,pokemon red/pokemon blue,gb,1996,Role-Playing,11.27,8.89,10.22,1.0,,,,31.38


## 2. Analyze the data

### 2a. Games Released by Year

In [222]:
# Count the games released in each year, then turn the year index back into an
# ordinary column so the result can be handed straight to plotly
releases_per_year = vg.groupby('year_of_release')['name'].count()
games_by_year = releases_per_year.reset_index()

display(games_by_year.head())

Unnamed: 0,year_of_release,name
0,1980,9
1,1981,46
2,1982,36
3,1983,17
4,1984,14


In [234]:
# Bar chart of the number of games released per year
axis_titles = {'year_of_release': 'Year of Release', 'name': 'Count'}
year_bar = px.bar(
    games_by_year,
    x='year_of_release',
    y='name',
    title='Games Released by Year',
    labels=axis_titles,
)

# Transparent plot area and surrounding paper
year_bar.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Hide the grid lines on both axes and fix the y-axis to the 0-1800 range
year_bar.update_xaxes(showgrid=False)
year_bar.update_yaxes(range=[0, 1800], showgrid=False)

year_bar.show()


## 3. Create a User Profile for Each Region

## 4. Test Hypotheses

## 5. Conclusion