# Apple Data Cleanup
_______________________

* Consideration: source data was scraped from the web

## Objectives:

* Create a cleaned-up version of the Apple Store source data by:

 - Filtering out games with no reviews
 - Removing duplicates
 - Converting all ratings, reviews, installs, and prices to uniform types and formats by column
 

* Subsequently, make sure there are no duplicate app names and no double counting / aggregation; organize by app, remove exact duplicates, and/or take the higher of the two


* The final product consists of a clean CSV file, a pie chart, and a bar graph.

In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the manually cleaned Apple Store dataset and display it
read = "Data Sheets/app-store-apple-data-set-10k-apps/AppleStoreClean.csv"
apple_df = pd.read_csv(filepath_or_buffer=read)
apple_df

FileNotFoundError: File b'Data Sheets/app-store-apple-data-set-10k-apps/AppleStoreClean.csv' does not exist

In [None]:
# Compute the 75th percentile of 'Total Ratings'; it is the cutoff used to
# drop the lower 75% of the dataset and work with a smaller sample
top_quartile = np.percentile(a=apple_df["Total Ratings"], q=75)
top_quartile

In [3]:
# Keep only the rows whose 'Total Ratings' is strictly above the
# 75th-percentile cutoff (the top 25%).
# The explicit .copy() makes the result an independent DataFrame, so the
# column assignments performed on it later modify it directly instead of
# acting on a slice of apple_df (which triggers SettingWithCopyWarning).
top_quartile_data_df = apple_df.loc[apple_df['Total Ratings'] > top_quartile].copy()
top_quartile_data_df

NameError: name 'apple_df' is not defined

In [4]:
# Fold the finer-grained categories into broader ones so that the pie and bar
# charts have fewer groups. The (old, new) pairs are applied one after another,
# each as a plain substring replacement on every 'Category' value.
for old_label, new_label in (
    ("Book", "Education"),
    ("News", "Education"),
    ("Food & Drink", "Lifestyle"),
    ("Music", "Lifestyle"),
    ("Shopping", "Lifestyle"),
    ("Health & Fitness", "Lifestyle"),
    ("Entertainment", "Lifestyle"),
    ("Weather", "Travel"),
    ("Navigation", "Travel"),
    ("Reference", "Utility"),
    ("Photo & Video", "Utility"),
    ("Utilities", "Utility"),
    ("Finance", "Business"),
):
    top_quartile_data_df['Category'] = [
        value.replace(old_label, new_label)
        for value in top_quartile_data_df['Category']
    ]

NameError: name 'top_quartile_data_df' is not defined

In [None]:
# List the distinct category labels present in the filtered data
categories = top_quartile_data_df['Category'].unique()
print(categories)

# Number of apps (rows) in each category
top_quartile_data_df.groupby(by='Category').size()

In [None]:
#Creating the Pie Chart
# Start a fresh figure so the pie is not drawn on top of a figure that an
# earlier plotting call left active.
plt.figure()

# App counts per category, in the same order as `categories` below.
# NOTE(review): these numbers are hard-coded -- presumably copied from the
# groupby('Category').size() output; re-check them if the input data changes.
members = [16, 21, 377, 98, 15, 32, 15, 24, 43]
categories = ["Business","Education","Games","Lifestyle","Productivity","Social Networking","Sports","Travel","Utility"]
# Offset the third wedge ("Games") slightly from the centre
explode = (0, 0, 0.05, 0, 0, 0, 0, 0, 0)
plt.pie(members, labels=categories, explode=explode, shadow=False, startangle=45)
plt.title("Apple Data")
# Equal axis scaling keeps the pie circular
plt.axis("equal")

#Saving the pie chart
plt.savefig("./Apple_Pie_Chart.png")

In [None]:
# Export the final clean data to its own CSV file.
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column that shows up as 'Unnamed: 0' when
# the CSV is read back with pd.read_csv.
top_quartile_data_df.to_csv("Final_Apple.csv", encoding='utf-8', index=False)


In [None]:
# Reload the exported CSV, replacing the in-memory DataFrame with its contents
top_quartile_data_df = pd.read_csv(
    filepath_or_buffer="Final_Apple.csv",
    encoding="utf-8",
)

In [None]:
# Display the current contents of top_quartile_data_df
top_quartile_data_df

In [None]:
# All rows whose 'App' value is exactly 'Facebook'
facebook_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Facebook'
]
facebook_rating

In [None]:
# Select the rows whose 'App' value is exactly 'Facebook' and display them
facebook_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Facebook'
]
facebook_rating

In [None]:
# All rows whose 'App' value is exactly 'Instagram'
instagram_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Instagram'
]
instagram_rating

In [None]:
# All rows whose 'App' value is exactly 'Clash of Clans'
coc_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Clash of Clans'
]
coc_rating

In [None]:
# All rows whose 'App' value is exactly 'Temple Run'
templerun_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Temple Run'
]
templerun_rating

In [None]:
# All rows whose 'App' value is exactly 'Pandora - Music & Radio'
pandora_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Pandora - Music & Radio'
]
pandora_rating

In [None]:
# All rows whose 'App' value is exactly 'Pinterest'
pinterest_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Pinterest'
]
pinterest_rating

In [None]:
# All rows whose 'App' value is exactly 'Bible'
bible_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Bible'
]
bible_rating

In [None]:
# All rows whose 'App' value is exactly 'Candy Crush Saga'
candycrushsaga_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Candy Crush Saga'
]
candycrushsaga_rating

In [None]:
# All rows whose 'App' value is exactly 'Spotify Music'
spotify_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Spotify Music'
]
spotify_rating

In [None]:
# All rows whose 'App' value is exactly 'Angry Birds'
angrybirds_rating = top_quartile_data_df[
    top_quartile_data_df['App'] == 'Angry Birds'
]
angrybirds_rating

# Creating bar charts for Categories vs. Average Rating

In [None]:
# The bar chart plots categories on the x-axis and average rating on the y-axis

# Build an independent two-column frame holding only 'Category' and
# 'Average Ratings'
new_df = top_quartile_data_df.loc[:, ['Category', 'Average Ratings']].copy()
new_df

In [None]:
#Finding the average for each category

def _category_mean(label):
    """Return the numeric column means of the rows whose Category contains *label*.

    The match is a substring match on the 'Category' column of ``new_df``.
    ``numeric_only=True`` restricts the mean to numeric columns: the frame
    still carries the string 'Category' column, and pandas 2.0+ raises a
    TypeError when asked to average it instead of silently dropping it.
    The result is a Series indexed by the numeric column names.
    """
    in_category = new_df['Category'].str.contains(label)
    return new_df[in_category].mean(numeric_only=True)


#Business
business = _category_mean("Business")

#Education
education = _category_mean("Education")

#Games
games = _category_mean("Games")

#Lifestyle
lifestyle = _category_mean("Lifestyle")

#Productivity
productivity = _category_mean("Productivity")

#Social Networking
social = _category_mean("Social Networking")

#Sports
sports = _category_mean("Sports")

#Travel
travel = _category_mean("Travel")

#Utility
utility = _category_mean("Utility")

# Collect the per-category results in chart order and print them
ys = [business, education, games, lifestyle, productivity, social, sports, travel, utility]
print(ys)

In [None]:
#Charting out the bar graph
# Open a new figure; otherwise the bars are drawn into whichever figure is
# currently active (for example a chart plotted earlier in the session).
plt.figure()

x = ["Business","Education","Games","Lifestyle","Productivity","Social Networking","Sports","Travel","Utility"]
# NOTE(review): hard-coded averages, in the same order as x -- presumably
# transcribed from the printed per-category means; re-check if the data changes.
y = [4.41, 4.02, 4.30, 4.14, 4.37, 3.70, 3.47, 4.0, 3.91 ]

plt.bar(x, y, color='b', alpha=0.5, align="center")
# rotation must be a number (or 'horizontal' / 'vertical'); the string "45"
# is rejected by current Matplotlib releases.
plt.xticks(rotation=45)
plt.title("Apple Store: Categories vs Average Rating")
plt.xlabel("Categories")
plt.ylabel("Average Rating")
# bbox_inches='tight' trims the saved image to the drawn content so the
# rotated tick labels are not cut off
plt.savefig("./Apple_Bar_Graph.png", bbox_inches='tight')
