# APPLE APP STORE - Transforming Raw to Clean Data
_______________________

* Consideration: source data was scraped from the web

## Objectives:

* Create a cleaned up version of the Apple Store Source Data by filtering:

 - Games with no reviews
 - Duplicates
 - Converting all ratings, reviews, installs, and price to uniform types and formats by column
 

* Subsequently, make sure there are no duplicate app names and no double counting / aggregation: organize by app, remove exact duplicates, and otherwise keep the higher of the two entries


* The final product consists of a clean CSV file, a pie chart, and a bar graph.

In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw, web-scraped App Store export.
# (Alternative input, manually cleaned: "./resources/final_clean/Final_Apple.csv")
read = "./resources/original_raw_data/appleappstore.csv"
apple_df = pd.read_csv(filepath_or_buffer=read)
apple_df

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,1,281656475,PAC-MAN Premium,100788224,USD,3.99,21292,26,4.0,4.5,6.3.5,4+,Games,38,5,10,1
1,2,281796108,Evernote - stay organized,158578688,USD,0.00,161065,26,4.0,3.5,8.2.2,4+,Productivity,37,5,23,1
2,3,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,USD,0.00,188583,2822,3.5,4.5,5.0.0,4+,Weather,37,5,3,1
3,4,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,USD,0.00,262241,649,4.0,4.5,5.10.0,12+,Shopping,37,5,9,1
4,5,282935706,Bible,92774400,USD,0.00,985920,5320,4.5,5.0,7.5.1,4+,Reference,37,5,45,1
5,6,283619399,Shanghai Mahjong,10485713,USD,0.99,8253,5516,4.0,4.0,1.8,4+,Games,47,5,1,1
6,7,283646709,PayPal - Send and request money safely,227795968,USD,0.00,119487,879,4.0,4.5,6.12.0,4+,Finance,37,0,19,1
7,8,284035177,Pandora - Music & Radio,130242560,USD,0.00,1126879,3594,4.0,4.5,8.4.1,12+,Music,37,4,1,1
8,9,284666222,PCalc - The Best Calculator,49250304,USD,9.99,1117,4,4.5,5.0,3.6.6,4+,Utilities,37,5,1,1
9,10,284736660,Ms. PAC-MAN,70023168,USD,3.99,7885,40,4.0,4.0,4.0.4,4+,Games,38,0,10,1


In [3]:
# Non-null value count for every column
apple_df.count(axis=0)

Unnamed: 0          7197
id                  7197
track_name          7197
size_bytes          7197
currency            7197
price               7197
rating_count_tot    7197
rating_count_ver    7197
user_rating         7197
user_rating_ver     7197
ver                 7197
cont_rating         7197
prime_genre         7197
sup_devices.num     7197
ipadSc_urls.num     7197
lang.num            7197
vpp_lic             7197
dtype: int64

In [4]:
# Drop the columns the analysis does not use, selected by position
# (negative positions count back from the last column).
cols = [0, 3, 7, -7, -4, -3, -2, -1]
apple_df.drop(columns=apple_df.columns[cols], inplace=True)
apple_df

Unnamed: 0,id,track_name,currency,price,rating_count_tot,user_rating,user_rating_ver,cont_rating,prime_genre
0,281656475,PAC-MAN Premium,USD,3.99,21292,4.0,4.5,4+,Games
1,281796108,Evernote - stay organized,USD,0.00,161065,4.0,3.5,4+,Productivity
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",USD,0.00,188583,3.5,4.5,4+,Weather
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",USD,0.00,262241,4.0,4.5,12+,Shopping
4,282935706,Bible,USD,0.00,985920,4.5,5.0,4+,Reference
5,283619399,Shanghai Mahjong,USD,0.99,8253,4.0,4.0,4+,Games
6,283646709,PayPal - Send and request money safely,USD,0.00,119487,4.0,4.5,4+,Finance
7,284035177,Pandora - Music & Radio,USD,0.00,1126879,4.0,4.5,12+,Music
8,284666222,PCalc - The Best Calculator,USD,9.99,1117,4.5,5.0,4+,Utilities
9,284736660,Ms. PAC-MAN,USD,3.99,7885,4.0,4.0,4+,Games


In [None]:
# Rename columns to the display names the later cells rely on:
# 'Total Ratings' feeds the top-quartile filter and 'Category' feeds the
# category merging / grouping. The source column is 'rating_count_tot'
# (not 'rating_count_total'); a key that matches no column is silently
# ignored by rename(), so the spelling matters.
apple_df.rename(
    columns={
        'track_name': 'App',
        'rating_count_tot': 'Total Ratings',
        'prime_genre': 'Category',
        # NOTE(review): assumes the 'Average Ratings' column used for the
        # bar chart corresponds to user_rating - confirm.
        'user_rating': 'Average Ratings',
    },
    inplace=True,
)

In [None]:
# 75th-percentile cut-off of the rating counts; rows above it form the
# smaller "top 25%" sample, the lower 75% is eliminated.
top_quartile = np.percentile(a=apple_df['Total Ratings'], q=75)
top_quartile

In [None]:
# New dataframe holding only the top 25% of apps by rating count.
# .copy() makes it independent of apple_df, so the column assignments made
# on it later modify this frame itself instead of a view/slice of apple_df
# (which would trigger pandas' SettingWithCopyWarning).
top_quartile_data_df = apple_df.loc[apple_df['Total Ratings'] > top_quartile].copy()
top_quartile_data_df

In [None]:
# Combine related categories to reduce the number of slices/bars in the
# pie and bar charts.

# Original label -> merged label. Applied in this order, as literal
# (non-regex) substring replacements.
category_groups = {
    "Book": "Education",
    "News": "Education",

    "Food & Drink": "Lifestyle",
    "Music": "Lifestyle",
    "Shopping": "Lifestyle",
    "Health & Fitness": "Lifestyle",
    "Entertainment": "Lifestyle",

    "Weather": "Travel",
    "Navigation": "Travel",

    "Reference": "Utility",
    "Photo & Video": "Utility",
    "Utilities": "Utility",
    "Finance": "Business",
}

# Build the merged column once and assign it a single time, instead of
# rebuilding the whole column with a Python-level loop for every label.
# The vectorised .str.replace also leaves missing values untouched rather
# than failing on them.
merged_category = top_quartile_data_df['Category']
for old_label, new_label in category_groups.items():
    merged_category = merged_category.str.replace(old_label, new_label, regex=False)
top_quartile_data_df['Category'] = merged_category

In [None]:
# Distinct category labels present in the top-quartile data
categories = top_quartile_data_df['Category'].unique()
print(categories)

# Number of apps in each of those categories
top_quartile_data_df.groupby('Category').size()

#apple_df.sort_values('Category', ascending=True)

In [None]:
# Creating the Pie Chart
plt.figure(1, figsize=(6.5,6.5))
# NOTE(review): slice sizes are hard-coded, presumably copied from an earlier
# groupby('Category').size() run - confirm they still match the data.
members = [16, 21, 377, 98, 15, 32, 15, 24, 43]
# One label per entry in `members`, in the same order.
categories = ["Business","Education","Games","Lifestyle","Productivity","Social Networking","Sports","Travel","Utility"]
# Pull the third slice ("Games") slightly out of the pie.
explode = (0, 0, 0.05, 0, 0, 0, 0, 0, 0)
plt.pie(members, labels=categories, explode=explode, shadow=False, startangle=45)
plt.title("Apple Data")
# Equal axis scaling so the pie is drawn as a circle.
plt.axis("equal")

#Saving the pie chart
plt.savefig("./Apple_Pie_Chart.png")

In [None]:
#Exporting the final clean data to its own CSV file
#top_quartile_data_df.to_csv("Final_Apple.csv", encoding='utf-8')


In [None]:
# Replace the in-memory frame with the saved, cleaned CSV
top_quartile_data_df = pd.read_csv(
    filepath_or_buffer="./resources/final_clean/Final_Apple.csv",
    encoding='utf-8',
)

In [None]:
# Drop the first two columns of the reloaded frame, selected by position
cols = [0, 1]
top_quartile_data_df.drop(columns=top_quartile_data_df.columns[cols], inplace=True)

In [None]:
# Display the current contents of the dataframe
top_quartile_data_df

In [None]:
# Row(s) whose App name is exactly 'Facebook'
facebook_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Facebook')]
facebook_rating

In [None]:
# Same 'Facebook' lookup as the preceding cell, re-run and displayed again
facebook_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Facebook')]
facebook_rating

In [None]:
# Row(s) whose App name is exactly 'Instagram'
instagram_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Instagram')]
instagram_rating

In [None]:
# Row(s) whose App name is exactly 'Clash of Clans'
coc_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Clash of Clans')]
coc_rating

In [None]:
# Row(s) whose App name is exactly 'Temple Run'
templerun_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Temple Run')]
templerun_rating

In [None]:
# Row(s) whose App name is exactly 'Pandora - Music & Radio'
pandora_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Pandora - Music & Radio')]
pandora_rating

In [None]:
# Row(s) whose App name is exactly 'Pinterest'
pinterest_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Pinterest')]
pinterest_rating

In [None]:
# Row(s) whose App name is exactly 'Bible'
bible_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Bible')]
bible_rating

In [None]:
# Row(s) whose App name is exactly 'Candy Crush Saga'
candycrushsaga_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Candy Crush Saga')]
candycrushsaga_rating

In [None]:
# Row(s) whose App name is exactly 'Spotify Music'
spotify_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Spotify Music')]
spotify_rating

In [None]:
# Row(s) whose App name is exactly 'Angry Birds'
angrybirds_rating = top_quartile_data_df[top_quartile_data_df['App'].eq('Angry Birds')]
angrybirds_rating

# Creating bar charts for Categories v Average Rating

In [None]:
# Bar chart plan: categories on the x-axis, average rating on the y-axis.

# Independent two-column frame with just the category and its rating
bar_chart_columns = ['Category', 'Average Ratings']
new_df = top_quartile_data_df.loc[:, bar_chart_columns].copy()
new_df

In [None]:
#Finding the average rating for each category

def _category_mean(label):
    """Return the mean 'Average Ratings' of rows whose Category contains *label*."""
    # na=False treats a missing Category as "no match" so the boolean mask
    # never contains NaN.
    in_category = new_df['Category'].str.contains(label, na=False)
    # Average only the numeric rating column. Calling .mean() on the whole
    # filtered frame would also try to average the 'Category' strings,
    # which raises TypeError on pandas >= 2.0, and on older versions yields
    # a one-element Series instead of a plain number.
    return new_df.loc[in_category, 'Average Ratings'].mean()

#Business
business = _category_mean("Business")

#Education
education = _category_mean("Education")

#Games
games = _category_mean("Games")

#Lifestyle
lifestyle = _category_mean("Lifestyle")

#Productivity
productivity = _category_mean("Productivity")

#Social Networking
social = _category_mean("Social Networking")

#Sports
sports = _category_mean("Sports")

#Travel
travel = _category_mean("Travel")

#Utility
utility = _category_mean("Utility")

# Same order as the category labels used for the bar chart's x-axis.
ys = [business, education, games, lifestyle, productivity, social, sports, travel, utility]
print(ys)

In [None]:
#Charting out the bar graph
x = ["Business","Education","Games","Lifestyle","Productivity","Social Networking","Sports","Travel","Utility"]
# NOTE(review): heights are hard-coded, presumably rounded from the
# per-category means printed by the preceding cell - confirm they are current.
y = [4.41, 4.02, 4.30, 4.14, 4.37, 3.70, 3.47, 4.0, 3.91 ]

# Start a separate figure: with the interactive notebook backend the pie
# chart's figure 1 is still the current figure, and the bars would
# otherwise be drawn onto it.
plt.figure(2)
plt.bar(x, y, color='b', alpha=0.5, align="center")
# rotation must be a number (or 'vertical'/'horizontal'); the string "45"
# is rejected by current matplotlib releases.
plt.xticks(rotation=45)
plt.title("Apple Store: Categories vs Average Rating")
plt.xlabel("Categories")
plt.ylabel("Average Rating")
# bbox_inches='tight' keeps the rotated tick labels inside the saved image.
plt.savefig("./Apple_Bar_Graph.png", bbox_inches='tight')
