# Data Science Course Week 1 - Pandas and Data Manipulation

## We will be using real data sources to explore the features of Pandas

MovieLens 100k movie rating data:
    main page: http://grouplens.org/datasets/movielens/
    data dictionary: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt
    files: u.user, u.data, u.item

WHO alcohol consumption data:
    article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/    
    original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption
    file: drinks.csv (with additional 'continent' column)

National UFO Reporting Center data:
    main page: http://www.nuforc.org/webreports.html
    file: ufo.csv


In [1]:
import pandas as pd

### Reading Files, Selecting Columns, and Summarizing

In [2]:
# can read a file from local computer or directly from a URL
# NOTE: no 'sep' is passed here, so the pipe-delimited fields are not split and the
# first record is consumed as the header (see the output below); the next cell
# reads the file with the proper options.
pd.read_table('u.user')
#pd.read_table('https://raw.githubusercontent.com/ihansel/SYD_DAT_3_labs/Week 1/u.user')

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101
5,7|57|M|administrator|91344
6,8|36|M|administrator|05201
7,9|29|M|student|01002
8,10|53|M|lawyer|90703
9,11|39|F|other|30329


In [3]:
# load the pipe-delimited 'u.user' file into the 'users' DataFrame
# (the file has no header row, so the column names are supplied here;
# zip codes are read as strings so values such as '05201' keep their leading zero)
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table(
    'u.user',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
    dtype={'zip_code': str},
)

In [None]:
# examine the users data
# (a cell renders only what is explicitly printed plus the value of its last
# expression; uncomment one line at a time to inspect the others)
#users                   # print the first 30 and last 30 rows
#type(users)             # DataFrame
#users.head()            # print the first 5 rows
#users.head(10)          # print the first 10 rows
#users.tail()            # print the last 5 rows
#users.index             # "the index" (aka "the labels")
#users.columns           # column names (which is "an index")
#users.dtypes            # data types of each column
users.shape             # number of rows and columns
users.values            # underlying numpy array
users.info()            # concise summary (including memory usage)

In [None]:
# select a column
#users['gender']         # select one column (bracket notation)
#type(users['gender'])   # Series
# attribute access is a shortcut for bracket notation; it only works when the
# column name is a valid Python identifier that does not clash with a DataFrame attribute
users.gender            # select one column using the DataFrame attribute


In [None]:
# summarize (describe) the data
#users.describe()                    # describe all numeric columns
#users.describe(include=['object'])  # describe all object columns (can include multiple types)
#users.describe(include='all')       # describe all columns
#users.gender.describe()             # describe a single column
users.age.mean()                    # only calculate the mean of the 'age' column


In [None]:
# count the number of occurrences of each value
# (the result is a Series indexed by the distinct values)
#users.occupation.value_counts()     # most useful for categorical variables
users.age.value_counts()        # can also be used with numeric variables


## EXERCISE ONE

In [None]:
# read drinks.csv into a DataFrame called 'drinks'
# two ways of loading the same file are shown; the second assignment simply
# replaces the result of the first
drinks = pd.read_table('drinks.csv', sep=',')   # explicit comma separator
drinks = pd.read_csv('drinks.csv')              # assumes separator is comma


In [None]:
# print the head and the tail
# (only the last expression of a cell is rendered, so run the lines one at a
# time to see both results)
drinks.head()           # first 5 rows
drinks.tail()           # last 5 rows

In [None]:
# examine the default index, data types, and shape
# (only the last expression of a cell is rendered; run the lines one at a time)
drinks.index            # the row labels generated when the file was read
drinks.dtypes           # data type of each column
drinks.shape            # (number of rows, number of columns)

In [None]:
# print the 'beer_servings' Series
# (both lines select the same column; only the last one is rendered)
drinks['beer_servings']     # bracket notation
drinks.beer_servings        # attribute notation


In [None]:
# calculate the average 'beer_servings' for the entire dataset
# (three routes to the mean, from broadest to narrowest; only the last is rendered)
drinks.describe()                   # summarize all numeric columns
drinks.beer_servings.describe()     # summarize only the 'beer_servings' Series
drinks.beer_servings.mean()         # only calculate the mean


In [None]:

# count the number of occurrences of each 'continent' value and see if it looks correct
# (value_counts() leaves out missing values unless dropna=False is passed)
drinks.continent.value_counts()


#### Filtering and Sorting

In [None]:
# logical filtering: only show users with age < 20
# (each line below is an independent expression; only the last one is rendered)
young_bool = users.age < 20         # create a Series of booleans...
users[young_bool]                   # ...and use that Series to filter rows
users[users.age < 20]               # or, combine into a single step
users[users.age < 20].occupation    # select one column from the filtered results
users[users.age < 20].occupation.value_counts()     # value_counts of resulting Series

In [None]:
# logical filtering with multiple conditions
# (each condition is wrapped in parentheses before being combined with & or |;
# only the last expression is rendered)
users[(users.age < 20) & (users.gender=='M')]       # ampersand for AND condition
users[(users.age < 20) | (users.age > 60)]          # pipe for OR condition
users[users.occupation.isin(['doctor', 'lawyer'])]  # alternative to multiple OR conditions


In [None]:
# sorting
# Series.order() and DataFrame.sort() have been removed from pandas and raise
# AttributeError; sort_values() is the replacement for both. Each call returns a
# new sorted object and leaves 'users' untouched.
users.age.sort_values()                     # sort a column
users.sort_values('age')                    # sort a DataFrame by a single column
users.sort_values('age', ascending=False)   # use descending order instead
users.sort_values(['occupation', 'age'])    # sort by multiple columns


## EXERCISE TWO

In [None]:
# keep only the rows whose continent code is 'EU'
is_european = drinks['continent'] == 'EU'
drinks.loc[is_european]


In [None]:
# European countries whose wine_servings exceed 300:
# build one boolean mask per condition, then combine them with AND
in_europe = drinks['continent'] == 'EU'
heavy_wine = drinks['wine_servings'] > 300
drinks.loc[in_europe & heavy_wine]

In [None]:
# mean of 'beer_servings' over the European rows only
# (row mask and column label are given to .loc in a single step)
drinks.loc[drinks['continent'] == 'EU', 'beer_servings'].mean()

In [None]:
# determine which 10 countries have the highest total_litres_of_pure_alcohol
# DataFrame.sort() has been removed from pandas (AttributeError); sort_values()
# is its replacement. Sort in descending order, then keep the first 10 rows.
drinks.sort_values('total_litres_of_pure_alcohol', ascending=False).head(10)


In [None]:
# give the 'beer_servings' column the shorter name 'beer'
# (the rename is applied to 'drinks' itself; the renamed column is then displayed)
drinks.rename(
    columns={'beer_servings': 'beer'},
    inplace=True,
)
drinks['beer']

In [None]:
# add a new column as a function of existing columns, total_servings = beer + wine + spirits
# Only 'beer_servings' has been renamed (to 'beer') at this point; the wine and
# spirit columns still carry their original names, so they are referenced as
# 'wine_servings' and 'spirit_servings'. (The previous version read drinks.wine
# and drinks.sprits, which do not exist and raised AttributeError.)
drinks['total_servings'] = drinks.beer + drinks.wine_servings + drinks.spirit_servings
#drinks.total_servings
drinks

In [None]:
# remove the column you just added
# axis=1 tells drop() to look for the label among the columns (without it the
# label is searched for in the row index and a KeyError is raised).
# errors='ignore' keeps the cell re-runnable once the column is already gone.
# Alternative: drinks.pop('total_servings') removes the column in place and returns it.
drinks = drinks.drop('total_servings', axis=1, errors='ignore')
drinks

### Handling Missing Values

In [None]:
# missing values are usually excluded by default
#drinks.continent.value_counts()              # excludes missing values
drinks.continent.value_counts(dropna=False)  # includes missing values (counted under NaN)

In [None]:
# find missing values in a Series
# (isnull() and notnull() return boolean Series that are exact opposites of each other)
#drinks.continent.isnull()           # True if missing, False if not missing
#drinks.continent.isnull().sum()     # count the missing values
drinks.continent.notnull()          # True if not missing, False if missing
#drinks[drinks.continent.notnull()]  # only show rows where continent is not missing

In [None]:
# side note: understanding axes
# numeric_only=True restricts the sum to the numeric columns. 'drinks' also holds
# text columns (country, continent), and summing text together with numbers
# across a row raises TypeError on current pandas versions.
#drinks.sum(axis=0, numeric_only=True)   # sums "down" the 0 axis (rows)
#drinks.sum(numeric_only=True)           # axis=0 is the default
drinks.sum(axis=1, numeric_only=True)   # sums "across" the 1 axis (columns)

In [None]:
# find missing values in a DataFrame
#drinks.isnull()             # DataFrame of booleans
# summing the booleans column by column counts the True (missing) entries
drinks.isnull().sum()       # count the missing values in each column

In [None]:
# fill in missing values
#drinks.continent.fillna(value='NA')                 # returns a filled copy, 'drinks' is unchanged
# Assign the filled Series back to the column. Calling fillna(..., inplace=True)
# on drinks.continent operates on an intermediate object, which pandas warns
# about and which does not update 'drinks' when copy-on-write is active.
drinks['continent'] = drinks.continent.fillna(value='NA')   # modifies 'drinks'

In [None]:
# turn off the missing value filter
# (with na_filter=False no field is converted to a missing value while reading,
# so text such as 'NA' is kept verbatim)
# NOTE: this re-reads the file and rebinds 'drinks', so changes made to the
# previous DataFrame (renamed/added columns, filled values) are discarded.
drinks = pd.read_csv('drinks.csv', na_filter=False)

## Merging Data

In [None]:
# load the first two fields of the pipe-delimited 'u.item' file into 'movies'
# (no header row in the file; the two retained columns are named below)
movie_cols = ['movie_id', 'title']
movies = pd.read_table(
    'u.item',
    sep='|',
    header=None,
    names=movie_cols,
    usecols=[0, 1],
)


In [None]:
# load the tab-delimited 'u.data' file into 'ratings'
# (no header row in the file; the column names are supplied below)
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table(
    'u.data',
    sep='\t',
    header=None,
    names=rating_cols,
)

In [None]:
# inner-join 'movies' with 'ratings' on 'movie_id', the only column the two
# frames have in common, then compare the shapes before and after the join
movie_ratings = movies.merge(ratings, how='inner', on='movie_id')
movies.shape
#ratings.shape
movie_ratings.shape

## Exercise Four - Movie Lens Data

In [None]:
# for each occupation in 'users', count the number of occurrences
#users.groupby('occupation').count()     # alternative: non-missing count per remaining column, per occupation
users.occupation.value_counts()          # one count per distinct occupation

In [None]:
# average age within each occupation group
ages_by_occupation = users.groupby('occupation')['age']
ages_by_occupation.mean()

In [None]:
# for each occupation, calculate the minimum and maximum ages
# (the first two lines compute each statistic separately; the agg() call returns
# both as columns of one result, and is the only expression rendered)
users.groupby('occupation').age.min()
users.groupby('occupation').age.max()
users.groupby('occupation').age.agg(['min', 'max'])

In [None]:
# average age for every (occupation, gender) pair
group_keys = ['occupation', 'gender']
users.groupby(group_keys)['age'].mean()

In [164]:
# randomly sample a DataFrame
# (the ~ negates the membership test so that 'test' holds the rows NOT drawn into 'train')
#train = drinks.sample(frac=0.75, random_state=1)    # will contain 75% of the rows
#test = drinks[~drinks.index.isin(train.index)]      # will contain the other 25%

# draw 10% of the rows; the fixed random_state makes the draw repeatable
drinks.sample(frac=0.1, random_state=1)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
44,Cyprus,192,154,113,8.2,EU
69,Guatemala,53,69,2,2.2,
162,Sudan,8,13,0,1.7,AF
35,Chile,130,124,172,7.6,SA
183,Tanzania,36,6,1,5.7,AF
11,Bahamas,122,176,51,6.3,
122,Nicaragua,78,118,1,3.5,
81,Ireland,313,118,165,11.4,EU
110,Micronesia,62,50,18,2.3,OC
179,Uganda,45,9,0,8.3,AF


In [163]:
# detect duplicate users
# ('user_id' was made the index when 'users' was read, so rows are compared on
# the remaining columns only: age, gender, occupation and zip_code)
#users.duplicated()          # True if a row is identical to a previous row
users.duplicated().sum()    # count of duplicates
#users[users.duplicated()]   # only show duplicates
#users.drop_duplicates()     # drop duplicate rows
#users.age.duplicated()      # check a single column for duplicates
#users.duplicated(['age', 'gender', 'zip_code']).sum()   # specify columns for finding duplicates


7