## Pandas for Exploratory Data Analysis

MovieLens 100k movie rating data:  
* main page: http://grouplens.org/datasets/movielens/  
* data dictionary: http://files.grouplens.org/datasets/movielens/ml-100k-README.txt  
* files: u.user, u.data, u.item

WHO alcohol consumption data:  
* article: http://fivethirtyeight.com/datalab/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/    
* original data: https://github.com/fivethirtyeight/data/tree/master/alcohol-consumption  
* file: drinks.csv (with additional 'continent' column)

National UFO Reporting Center data:  
* main page: http://www.nuforc.org/webreports.html  
* file: ufo.csv

In [None]:
import pandas as pd

### Reading Files, Selecting Columns, and Summarizing

In [None]:
# pandas readers accept either a local path or a remote URL; point at the hosted 'u.user'
user_data_url = (
    'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/u.user'
)


In [None]:
# column labels to apply when 'u.user' is read into 'users'
user_cols = [
    'user_id',
    'age',
    'gender',
    'occupation',
    'zip_code',
]

In [None]:
# first attempt: read with read_table's defaults (no sep= or names= given);
# 'users' is reassigned by the next read, which passes sep='|' and names=user_cols
users = pd.read_table(user_data_url)


In [None]:
# read 'u.user' into 'users': fields are pipe-separated and column labels come from user_cols
users = pd.read_table(
    user_data_url,
    sep='|',
    header=None,      # already implied by names=; spelled out so the first line is clearly data
    names=user_cols,
)

In [None]:
# print the first 30 and last 30 rows

### examine the users data

In [None]:
# What type? DataFrame

In [None]:
# print the first 5 rows

In [None]:
# print the first 10 rows

In [None]:
# print the last 5 rows

In [None]:
# "the index" (aka "the labels")

In [None]:
# column names (which is "an index")

In [None]:
# data types of each column

In [None]:
# number of rows and columns

In [None]:
# underlying numpy array

In [None]:
# concise summary (including memory usage)

#### select a column

In [None]:
# select one column

In [None]:
# Series

In [None]:
# select one column using the DataFrame attribute

### summarize (describe) the data

In [None]:
# describe all numeric columns

In [None]:
# describe all object columns (can include multiple types)

In [None]:
# describe all columns

In [None]:
# describe a single column

In [None]:
# only calculate the mean

### count the number of occurrences of each value

In [None]:
# most useful for categorical variables

In [None]:
# can also be used with numeric variables

# EXERCISE ONE

In [None]:
# read drinks.csv into a DataFrame called 'drinks'
drinks_data_url = (
    'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/drinks.csv'
)


In [None]:
# print the head and the tail


In [None]:
# examine the default index, data types, and shape


In [None]:
# print the 'beer_servings' Series


In [None]:
# calculate the average 'beer_servings' for the entire dataset


In [None]:
# count the number of occurrences of each 'continent' value and see if it looks correct


## Filtering and Sorting

### logical filtering: only show users with age < 20

In [None]:
# create a Series of booleans...

In [None]:
# ...and use that Series to filter rows

In [None]:
# or, combine into a single step

In [None]:
# select one column from the filtered results

In [None]:
# value_counts of resulting Series

### logical filtering with multiple conditions

In [None]:
# ampersand for AND condition

In [None]:
# pipe for OR condition

In [None]:
 # alternative to multiple OR conditions

### sorting

In [None]:
# sort a DataFrame by a single column

In [None]:
# use descending order instead

In [None]:
# sort by multiple columns

# EXERCISE TWO

In [None]:
# filter DataFrame to only include European countries


In [None]:
# filter DataFrame to only include European countries with wine_servings > 300


In [None]:
# calculate the average 'beer_servings' for all of Europe


In [None]:
# determine which 10 countries have the highest total_litres_of_pure_alcohol


### Renaming, Adding, and Removing Columns

In [None]:
# renaming one or more columns


In [None]:
# different way to do same

In [None]:
# replace all column names


In [None]:
# replace during file reading

In [None]:
# replace after file reading

In [None]:
# add a new column as a function of existing columns


### removing columns

In [None]:
# Drop a column. axis=0 for rows, 1 for columns

In [None]:
# drop multiple columns

In [None]:
# make it permanent

### Handling Missing Values

#### missing values are usually excluded by default

In [None]:
 # excludes missing values

In [None]:
# includes missing values

#### find missing values in a Series

In [None]:
# True if missing, False if not missing

In [None]:
# count the missing values

In [None]:
 # True if not missing, False if missing

In [None]:
# only show rows where continent is not missing

#### side note: understanding axes

In [None]:
 # sums "down" the 0 axis (rows)

In [None]:
 # axis=0 is the default

In [None]:
 # sums "across" the 1 axis (columns)

#### find missing values in a DataFrame

In [None]:
 # DataFrame of booleans

In [None]:
 # count the missing values in each column

#### drop missing values

In [None]:
# drop a row if ANY values are missing

In [None]:
 # drop a row only if ALL values are missing

#### fill in missing values

In [None]:
# fill in missing values with 'NA'

In [None]:
 # modifies 'drinks' in-place

In [None]:
# turn off the missing value filter


# EXERCISE THREE

In [None]:
# read ufo.csv into a DataFrame called 'ufo'
ufo_data_url = (
    'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/ufo.csv'
)


In [None]:
# check the shape of the DataFrame


In [None]:
# what are the three most common colors reported?


In [None]:
# rename any columns with spaces so that they don't contain spaces


In [None]:
# for reports in VA, what's the most common city?


In [None]:
# print a DataFrame containing only reports from Arlington, VA


In [None]:
# count the number of missing values in each column


In [None]:
# how many rows remain if you drop all rows with any missing values?


### Split-Apply-Combine
![Split-Apply-Combine diagram](http://i.imgur.com/yjNkiwL.png)

In [None]:
# for each continent, calculate the mean beer servings


In [None]:
# for each continent, calculate the mean of all numeric columns


In [None]:
# for each continent, describe beer servings


In [None]:
# similar, but outputs a DataFrame and can be customized


In [None]:
# for each continent, describe all numeric columns


In [None]:
# for each continent, count the number of occurrences


# EXERCISE FOUR

In [None]:
# for each occupation in 'users', count the number of occurrences
 # sorted by counts

In [None]:
# sorted alphabetically by occupation

In [None]:
# for each occupation, calculate the mean age


In [None]:
# for each occupation, calculate the minimum and maximum ages


In [None]:
# for each combination of occupation and gender, calculate the mean age


### Selecting Multiple Columns and Filtering Rows

#### select multiple columns

In [None]:
# create a list of column names...

In [None]:
# ...and use that list to select columns

In [None]:
 # or, combine into a single step

#### use loc to select columns by name

In [None]:
 # colon means "all rows", then select one column

In [None]:
 # select two columns

In [None]:
 # select a range of columns

#### loc can also filter rows by "name" (the index)

In [None]:
 # row 0, all columns

In [None]:
 # rows 0/1/2, all columns

In [None]:
  # rows 0/1/2, range of columns

#### use iloc to filter rows and select columns by integer position

In [None]:
  # all rows, columns in position 0/3

In [None]:
  # all rows, columns in position 0/1/2/3

In [None]:
  # rows in position 0/1/2, all columns

### Joining (Merging) DataFrames

In [None]:
# Ignore this cell - old code!
# read 'u.item' into 'movies'
# movie_cols = ['movie_id', 'title']
# u_item = r'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.item'
# movies = pd.read_table(u_item, sep='|', header=None, names=movie_cols, usecols=[0, 1])
# movies.head()

In [None]:
# load the top-rated IMDb movies file, keeping only the descriptive columns
movies = pd.read_csv(
    'http://bit.ly/imdbratings',
    usecols=['title', 'genre', 'duration', 'actors_list'],
)


In [None]:
# read the rating-related columns of the same IMDb file into 'ratings'
# ('title' is the only column this selection shares with 'movies')
ratings = pd.read_csv('http://bit.ly/imdbratings', usecols=['star_rating', 'title', 'content_rating'])


In [None]:
# merge 'movies' and 'ratings' with an inner join on 'title', the one column they share
# (neither frame has a 'movie_id' column; the key and join type are spelled out
# instead of being left to pd.merge's defaults)
# NOTE(review): a title that occurs more than once in the file yields extra rows -- compare the shapes below
movie_ratings = pd.merge(movies, ratings, on='title', how='inner')


In [None]:
# (rows, columns) of 'movies'
movies.shape


In [None]:
# (rows, columns) of 'ratings'
ratings.shape


In [None]:
# (rows, columns) of the merged result, for comparison with the two inputs
movie_ratings.shape

### Other Commonly Used Features

In [None]:
# recode 'gender' as a 0/1 indicator column: 'F' -> 0, 'M' -> 1
# (values missing from the mapping come back as NaN)
users['is_male'] = users['gender'].map({'F': 0, 'M': 1})


In [None]:
# replace all instances of a value in a column (must match entire value)
# the result is assigned back to the column: calling replace(..., inplace=True) on
# 'ufo.State' is chained assignment and may modify a temporary rather than 'ufo'
ufo['State'] = ufo['State'].replace('Fl', 'FL')


In [None]:
# string methods are accessed via 'str'
ufo.State.str.upper()                             # converts to uppercase
# na must be the boolean False so missing values count as "no match"; the string
# 'False' would be inserted as-is (and is truthy), so the result could not serve as a filter
ufo.Colors_Reported.str.contains('RED', na=False) # checks for a substring


In [None]:
# parse the 'Time' column into datetime values
ufo['Time'] = pd.to_datetime(ufo['Time'])
ufo['Time'].dt.hour                             # the .dt accessor exposes components such as the hour
(ufo['Time'].max() - ufo['Time'].min()).days    # subtracting datetimes gives a Timedelta; .days is its whole-day part


In [None]:
# setting and then removing an index
ufo.set_index('Time', inplace=True)    # 'Time' becomes the index; 'ufo' itself is modified
ufo.reset_index(inplace=True)          # 'Time' returns as a regular column and the default integer index is restored


In [None]:
# cast the 'beer' column to floating point
# NOTE(review): assumes 'beer_servings' was renamed to 'beer' in an earlier cell -- confirm
drinks['beer'] = drinks['beer'].astype(float)

In [None]:
# one indicator column per 'continent' value, named 'cont_<value>';
# drop_first=True leaves out the first indicator column
continent_dummies = pd.get_dummies(drinks['continent'], prefix='cont', drop_first=True)


In [None]:
# place the dummy columns next to the existing ones (rows are aligned on the index);
# axis='columns' is the named form of axis=1, axis='index' of axis=0
drinks = pd.concat([drinks, continent_dummies], axis='columns')


### Other Less Used Features

#### detecting duplicate rows


In [None]:
# each line is an independent example; none of them modifies 'users'
users.duplicated()          # True if a row is identical to a previous row
users.duplicated().sum()    # count of duplicates
users[users.duplicated()]   # only show duplicates
users.drop_duplicates()     # drop duplicate rows (returns a new DataFrame)
users.age.duplicated()      # check a single column for duplicates
users.duplicated(['age', 'gender', 'zip_code']).sum()   # specify columns for finding duplicates


#### convert a range of values into descriptive groups

In [None]:
# bucket 'beer' into labels; between() includes both endpoints, so any value outside
# 101-400 (including anything above 400) keeps the initial 'low' label
drinks['beer_level'] = 'low'    # initially set all values to 'low'
drinks.loc[drinks.beer.between(101, 200), 'beer_level'] = 'med'     # change 101-200 to 'med'
drinks.loc[drinks.beer.between(201, 400), 'beer_level'] = 'high'    # change 201-400 to 'high'


In [None]:
# display a cross-tabulation of two Series
# (rows: 'continent' values, columns: 'beer_level' values, cells: number of rows with that pair)
pd.crosstab(drinks.continent, drinks.beer_level)


In [None]:
# convert 'beer_level' into the 'category' data type
# the categories list fixes the sort order: 'low', then 'med', then 'high'
drinks['beer_level'] = pd.Categorical(drinks.beer_level, categories=['low', 'med', 'high'])
drinks.sort_values('beer_level')   # sorts by the categorical ordering (low to high)


In [None]:
# limit which rows are read when reading in a file
pd.read_csv(drinks_data_url, nrows=10)           # only read first 10 rows
pd.read_csv(drinks_data_url, skiprows=[1, 2])    # skip the first two rows of data (file line 0, the header, is kept)


In [None]:
# write a DataFrame out to a CSV
# (both calls target the same path, so the second file replaces the first)
drinks.to_csv('drinks_updated.csv')                 # index is used as first column
drinks.to_csv('drinks_updated.csv', index=False)    # ignore index


In [None]:
# build a DataFrame from a dict: each key is a column label, each list holds that column's values
pd.DataFrame({
    'capital': ['Montgomery', 'Juneau', 'Phoenix'],
    'state': ['AL', 'AK', 'AZ'],
})


In [None]:
# build a DataFrame from a list of rows; the column labels are supplied separately
pd.DataFrame(
    [
        ['Montgomery', 'AL'],
        ['Juneau', 'AK'],
        ['Phoenix', 'AZ'],
    ],
    columns=['capital', 'state'],
)


In [None]:
# randomly split a DataFrame into two parts using a boolean mask
import numpy as np
mask = np.random.rand(len(drinks)) < 0.66   # NumPy array of booleans, one per row (no seed is set, so it differs per run)
train = drinks[mask]                        # will contain around 66% of the rows
test = drinks[~mask]                        # will contain the remaining rows


In [None]:
# change the maximum number of rows and columns printed ('None' means unlimited)
# the full 'display.*' keys are used because the bare 'max_rows' / 'max_columns'
# patterns can match more than one option (e.g. 'styler.render.max_rows'), which
# set_option rejects as ambiguous
pd.set_option('display.max_rows', None)     # default is 60 rows
pd.set_option('display.max_columns', None)  # default is 20 columns
print(drinks)

In [None]:
# reset options to defaults
# (full 'display.*' keys, so only these two display options are reset and not
# every option whose name happens to contain the pattern)
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [None]:
# change the options temporarily (settings are restored when you exit the 'with' block)
# full 'display.*' keys are required here: a bare 'max_rows' / 'max_columns' pattern
# can match several options and is then rejected as ambiguous
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(drinks)