In [1]:
import numpy as np
import pandas as pd
import pymongo

In [2]:
def read_mongo(collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Read a collection of the ``appstore`` database into a DataFrame.

    Parameters
    ----------
    collection : str
        Name of the collection inside the ``appstore`` database.
    query : dict, optional
        Filter document handed to ``find``. ``None`` (the default) is
        treated as ``{}``, i.e. every document is returned.
    host : str
        Host name passed to ``pymongo.MongoClient``.
    port : int
        Port passed to ``pymongo.MongoClient``.
    username, password : str, optional
        Credentials. Each is forwarded to ``MongoClient`` as a keyword
        argument only when it is not ``None``, so unauthenticated calls
        construct the client exactly as before.
    no_id : bool
        When true, the ``_id`` column is removed from the result if present.

    Returns
    -------
    pandas.DataFrame
        One row per document returned by the query; empty (no columns) when
        the query yields no documents.
    """
    # Avoid a shared mutable default: build a fresh filter per call.
    if query is None:
        query = {}

    # Only pass credentials that were actually supplied.
    client_kwargs = {}
    if username is not None:
        client_kwargs['username'] = username
    if password is not None:
        client_kwargs['password'] = password

    # Connect to MongoDB and query the requested collection; the cursor is
    # materialised inside the `with` block, before the client is closed.
    with pymongo.MongoClient(host, port, **client_kwargs) as client:
        table = client.appstore[collection]
        df = pd.DataFrame(list(table.find(query)))

    # Drop the _id column only when it exists: an empty result set produces
    # a DataFrame without columns, where an unconditional `del` would raise.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [18]:
# Load every document of the `appitems` collection into a DataFrame.
apps_df = read_mongo(collection='appitems')

In [19]:
# (rows, columns) of the loaded table: one row per document returned by read_mongo.
apps_df.shape # 5658 rows x 26 columns; uniqueness of the app `id` is not verified in this cell

(5658, 26)

In [20]:
# Preview the first five records, transposed so each field reads as a row.
apps_df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
category,Finance,Finance,Finance,Finance,Finance
current_rating,1.8,4.55556,4.64497,2.16667,4.60773
description,The JPay App lets you send money and email to ...,Splitwise is the best way to share bills and I...,Access your interactive Experian Credit Report...,Use the Vanguard app to check your accounts an...,Conveniently manage your credit card account f...
id,584959322,458023433,1087101090,335186209,1128712763
is_InAppPurcased,0,0,1,0,0
is_multilingual,0,0,0,0,0
is_multiplatform,0,0,0,0,0
name,JPay,Splitwise - Split bills and expenses the easy way,Experian - Free Credit Report,Vanguard,Credit One Bank Mobile
new_version_desc,This update has a big new feature for the new ...,- Starting today users outside the US can invi...,Bug fixes and other minor updates,See what's new! Have an IRA? Watch your progr...,- Ability add an additional account if qualifi...
num_current_rating,20,18,169,30,724


In [6]:
# List the column labels available in the loaded app table.
apps_df.columns

Index([u'category', u'current_rating', u'description', u'id',
       u'is_InAppPurcased', u'is_multilingual', u'is_multiplatform', u'name',
       u'new_version_desc', u'num_current_rating', u'num_overall_rating',
       u'overall_rating', u'price', u'publish_date', u'review1',
       u'review1_star', u'review2', u'review2_star', u'review3',
       u'review3_star', u'scrape_date', u'seller', u'size', u'update_date',
       u'url', u'version'],
      dtype='object')

In [26]:
# Lookup table translating the textual star labels into numeric ratings.
rating_cleaned = {
    '1 star': 1,
    '1 and a half stars': 1.5,
    '2 stars': 2,
    '2 and a half stars': 2.5,
    '3 stars': 3,
    '3 and a half stars': 3.5,
    '4 stars': 4,
    '4 and a half stars': 4.5,
    '5 stars': 5,
}

In [28]:
# Convert the textual star labels to numbers; values that are not keys of
# `rating_cleaned` are left as they are by `replace`.
apps_df['overall_rating'] = apps_df['overall_rating'].replace(rating_cleaned)

In [30]:
# Show the apps whose overall rating equals 5, transposed for readability.
apps_df.loc[apps_df['overall_rating'] == 5].transpose()

Unnamed: 0,75,83,100,114,151,168,194,267,361,389,...,5297,5512,5519,5532,5545,5560,5569,5592,5601,5654
category,Finance,Food & Drink,Food & Drink,Food & Drink,Food & Drink,Entertainment,Entertainment,Education,Entertainment,Entertainment,...,Weather,Utilities,Utilities,Utilities,Utilities,Utilities,Utilities,Weather,Weather,Utilities
current_rating,4.91324,4.74756,4.33333,5,4.74468,4.83622,4.21298,4.86458,4.08333,4.52899,...,4.33333,4.85675,4.27273,4.48935,4.8994,4.77315,4.73134,4.92378,4.80565,4.54294
description,"With Affirm, you can split almost any online p...",#1 Food & Drink App and Featured as 'Best New ...,"Struggle free, healthy and practical recipes t...",Join thousands of food-lovers spinning with Sp...,600+ NUTRITION MYTHS BUSTED! Have fun while pl...,Do you want to see the best trends of wallpape...,Have you ever wondered that one day even ordin...,ChineseSkill is the most effective app to lear...,BRUH! The Official Bruh Button App. Voice by @...,The best and most complete Vine soundboard on ...,...,6 Reasons why you should try the ATsWeatherToG...,***FREE TODAY (Reg. $0.99)*** Instant ON NO AD...,Turn Every Moment into Impressive Memories wit...,"Fast, bright, easy to use and FREE. The cooles...","Power Clean is a small,fast,free,practical cle...","Download and browse files on your iPhone, iPod...","Fast, bright, and beautiful. Featuring instant...",Smile every time you check the weather! The pu...,MOON is your personal lunar portal. Never be c...,Bright. Fast. Simple. The most elegant and fun...
id,967040652,903911740,980368562,876355608,668562312,1023217192,910447989,777111034,908989995,898524971,...,833872518,379745980,892312330,384365348,1115097063,979401801,379753015,528732981,660036257,381471023
is_InAppPurcased,0,0,1,0,0,1,1,1,1,1,...,0,0,1,0,1,0,1,1,1,0
is_multilingual,0,0,0,0,1,1,1,1,0,0,...,0,0,1,0,1,1,1,0,0,1
is_multiplatform,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
name,Affirm: Split purchases into easy monthly paym...,Forks Over Knives - Healthy Recipes & Easy Meals,Fit Men Cook - Healthy Recipes,Spotluck - Dining Community of Local Restaurants,"Nutrition Quiz: 600+ Facts, Myths & Diet Tips ...",Top Chart of Wallpapers & Themes - Amazing Ima...,Voice Changer App – SoundBoard Effects for Vine,ChineseSkill -Learn Mandarin Chinese Language ...,Bruh-Button,VSounds for Vine Soundboard - Soundboard for V...,...,ATsWeatherToGo,Flashlight ®,Photo & Video Collage Maker with Music,iTorch Flashlight - Led Flash Light for iPhone,"Power Clean - Clean Duplicate Photos, Scan Net...",Browser and File Manager for Documents,Light - LED Flashlight,Weather Puppy,MOON - Current Moon Phase,Flashlight Ⓞ
new_version_desc,More helpful error messages when we detect tha...,Added support for iOS10 Misc bug fixes & adjus...,9 new recipes: - Keto Bison Meatballs - Health...,MESSAGING restaurants may now respond directl...,Your favorite nutrition gaming app is now more...,"- Smart Categories (categories as #tags, ratin...",- Compatible with iOS 10 This release also con...,1. New features added. 2. Bug fixes and perfor...,Updated contest. We are now giving away a dron...,- Minor bug fixes,...,"Access to Road Angel, roadside assistance. Rem...","Dimmable LED Slider (iOS 6, 7, 8, 9 and 10) iP...",-Now you can choose layouts with common ratios...,Now with Apple Watch extension! Open iTorch on...,Our team work hard to build the best app for y...,3rd party SDKs caused broken files are removed...,Bug fixes.,- Facebook share bug fixed! - More accurate ci...,8.1 - Share the current moon with the new iMes...,- Fully compatible with iOS 10 now. - Increase...
num_current_rating,219,103,9,14,94,12433,385,96,156,707,...,6,15707,165,94,517,19048,67,328,247,652


In [31]:
# Persist the table, including the numeric overall_rating column, to disk.
apps_df.to_pickle(path='app_cleaned.pickle')