In [None]:
# importing necessary dependencies
import pandas as pd
import datetime as dt
import warnings
import json
import pymongo
# Ignore every warning raised from this point on. Note that this also hides
# deprecation and chained-assignment warnings emitted by the cells below.
warnings.filterwarnings('ignore')

In [None]:
# Connect to the local MongoDB server; 27017 is MongoDB's default port.
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Handle to the 'video_game_db' database (item access on the client is
# equivalent to attribute access).
db = client['video_game_db']

# Start from a clean slate: remove every document from the collection.
# NOTE(review): the collection is named 'destinations', which does not obviously
# relate to video games -- confirm this is the collection the notebook uses.
db['destinations'].delete_many({})

In [None]:
# Load the two raw CSV files: sales figures and review scores.
vg_sales = pd.read_csv("data/vgsales.csv")
vg_review = pd.read_csv("data/all_games.csv")

# Working handles used by the merge step further down. Wrapping an existing
# DataFrame in pd.DataFrame() does not deep-copy it by default, so these should
# not be treated as independent copies of the frames above.
salesDF, reviewDF = pd.DataFrame(vg_sales), pd.DataFrame(vg_review)

In [None]:
# Preview the first five rows of the sales data (head() defaults to n=5).
vg_sales.head()


In [None]:
# The review data calls its title column 'name'; rename it to 'Name' so it
# matches salesDF and the two frames can be joined on that column.
#
# The rename is applied to BOTH handles of the review data. The merge reads
# `reviewDF`, and whether an in-place rename of `vg_review` is also visible
# through `reviewDF` depends on the pandas version, so renaming `vg_review`
# alone can leave the merge without a 'Name' key on the review side.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run: rename()
# skips labels that are no longer present.
title_column_fix = {'name': 'Name'}
vg_review = vg_review.rename(columns=title_column_fix)
reviewDF = reviewDF.rename(columns=title_column_fix)

# Preview last, so the renamed frame is what the cell actually displays.
vg_review.head(5)

In [None]:
# Join sales and review rows that share a game title; only titles present in
# both frames survive (inner join). 'summary' and the capitalised 'Platform'
# column (presumably the one from the sales file -- confirm) are discarded
# straight away.
# NOTE(review): the join key is 'Name' alone, so a title that occurs several
# times on either side yields every pairing of its rows -- confirm this is intended.
DF = (
    salesDF
    .merge(reviewDF, how='inner', on="Name")
    .drop(columns=['summary', 'Platform'])
)
DF.head(2)

In [None]:
# Clean column names: capitalise the remaining lowercase columns.
# rename() returns a new frame, so DF is rebound rather than mutated in place.
DF = DF.rename(columns={'platform': 'Platform',
                        'release_date': 'Release_Date',
                        'meta_score': 'Meta_Score',
                        'user_review': 'User_Review'})

# Convert Release_Date to datetime.
# pd.to_datetime is used instead of .astype('datetime64'): casting to the
# unit-less 'datetime64' dtype raises TypeError on pandas >= 2.0.
DF['Release_Date'] = pd.to_datetime(DF['Release_Date'])

# Drop the Year column.
DF = DF.drop(columns=['Year'])

# Only the last expression of a cell is displayed, so preview once, at the end.
DF.head(2)


In [None]:
# Missing-value handling: discard every row that contains at least one NaN
# and keep the result under a new name so DF itself is left untouched.
vg_DF = DF.dropna(how='any')

# Per-column count of remaining missing values, shown as the cell output.
vg_DF.isna().sum()

In [None]:
# Count rows that are exact duplicates of another row (keep=False flags every
# member of a duplicate group). The count is printed because a bare expression
# in the middle of a cell is computed and then silently discarded.
duplicate_row_count = int(vg_DF.duplicated(keep=False).sum())
print(duplicate_row_count)

# Drop rows whose User_Review is the placeholder 'tbd'; they cannot be
# converted to a number.
# A boolean mask plus an explicit .copy() replaces the in-place query(): the
# column assignment below then targets a frame that owns its data instead of a
# filtered view, so it does not depend on SettingWithCopyWarning being silenced.
vg_DF = vg_DF.loc[vg_DF['User_Review'] != 'tbd'].copy()

# Convert User_Review to float.
vg_DF['User_Review'] = vg_DF['User_Review'].astype(float)

In [None]:
# Break Release_Date into three separate numeric columns (year, month, day)
# using the datetime accessor; the columns are added in that order.
release_parts = vg_DF['Release_Date'].dt
vg_DF = vg_DF.assign(
    Release_Year=release_parts.year,
    Release_Month=release_parts.month,
    Release_Day=release_parts.day,
)

vg_DF.head(2)

In [None]:
# Select the columns in their final presentation order: identifying fields,
# then scores, then release-date fields, then sales figures.
column_order = [
    'Rank', 'Name', 'Genre', 'Publisher', 'Platform',
    'Meta_Score', 'User_Review',
    'Release_Date', 'Release_Day', 'Release_Month', 'Release_Year',
    'Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
]
video_gameData = vg_DF[column_order]
video_gameData