# Practice 5: Data Manipulation with Pandas

## Transforming Data

In [2]:
import pandas as pd
import pickle
# Python pickle module is used for serializing and de-serializing a Python object structure. 
# Any object in Python can be pickled so that it can be saved on disk. 
# What pickle does is that it “serializes” the object first before writing it to file. 
# Pickling is a way to convert a python object (list, dict, etc.) into a character stream. 
# The idea is that this character stream contains all the information necessary to reconstruct the object in another python script.
# https://www.geeksforgeeks.org/understanding-python-pickling-example/

In [3]:
with open('homeless_data.pkl', 'rb') as f:
    homelessness = pickle.load(f)
with open('walmart_sales.pkl', 'rb') as f:
    sales = pickle.load(f)

#### Inspecting a DataFrame

In [11]:
################################## TODO ##################################
# Print the head of the homelessness data
homelessness.head()

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [12]:
################################## TODO ##################################
# Print information about homelessness
homelessness.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ KB


In [14]:
################################## TODO ##################################
# Print the shape of homelessness
homelessness.shape

(51, 5)

In [15]:
################################## TODO ##################################
# Print a description of homelessness
homelessness.describe()

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


In [29]:
# Print the values of homelessness
print(homelessness.values)

[['East South Central' 'Alabama' 2570.0 864.0 4887681]
 ['Pacific' 'Alaska' 1434.0 582.0 735139]
 ['Mountain' 'Arizona' 7259.0 2606.0 7158024]
 ['West South Central' 'Arkansas' 2280.0 432.0 3009733]
 ['Pacific' 'California' 109008.0 20964.0 39461588]
 ['Mountain' 'Colorado' 7607.0 3250.0 5691287]
 ['New England' 'Connecticut' 2280.0 1696.0 3571520]
 ['South Atlantic' 'Delaware' 708.0 374.0 965479]
 ['South Atlantic' 'District of Columbia' 3770.0 3134.0 701547]
 ['South Atlantic' 'Florida' 21443.0 9587.0 21244317]
 ['South Atlantic' 'Georgia' 6943.0 2556.0 10511131]
 ['Pacific' 'Hawaii' 4131.0 2399.0 1420593]
 ['Mountain' 'Idaho' 1297.0 715.0 1750536]
 ['East North Central' 'Illinois' 6752.0 3891.0 12723071]
 ['East North Central' 'Indiana' 3776.0 1482.0 6695497]
 ['West North Central' 'Iowa' 1711.0 1038.0 3148618]
 ['West North Central' 'Kansas' 1443.0 773.0 2911359]
 ['East South Central' 'Kentucky' 2735.0 953.0 4461153]
 ['West South Central' 'Louisiana' 2540.0 519.0 4659690]
 ['New 

In [30]:
# Print the column index of homelessness
print(homelessness.columns)

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')


In [24]:
################################## TODO ##################################
# Print the row index of homelessness
homelessness.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
            50],
           dtype='int64')

In [34]:
# Sorting rows
################################## TODO ##################################
# Sort homelessness by individual
homelessness_ind = homelessness.sort_values("individuals")

In [35]:
# Print the top few rows
print(homelessness_ind.head())

                region         state  individuals  family_members  state_pop
50            Mountain       Wyoming        434.0           205.0     577601
34  West North Central  North Dakota        467.0            75.0     758080
7       South Atlantic      Delaware        708.0           374.0     965479
39         New England  Rhode Island        747.0           354.0    1058287
45         New England       Vermont        780.0           511.0     624358


In [38]:
################################## TODO ################################## 
# Sort homelessness in the descending order of the number of family members
homelessness_fam = homelessness.sort_values("family_members")

In [39]:
# Print the top few rows
print(homelessness_fam.head())

                region          state  individuals  family_members  state_pop
34  West North Central   North Dakota        467.0            75.0     758080
50            Mountain        Wyoming        434.0           205.0     577601
48      South Atlantic  West Virginia       1021.0           222.0    1804291
41  West North Central   South Dakota        836.0           323.0     878698
24  East South Central    Mississippi       1024.0           328.0    2981020


In [40]:
################################## TODO ##################################
# Sort homelessness by region, then descending family members
homelessness_reg_fam = homelessness.sort_values(["region", "family_members"])

In [41]:
# Print the top few rows
print(homelessness_reg_fam.head())

                region      state  individuals  family_members  state_pop
14  East North Central    Indiana       3776.0          1482.0    6695497
49  East North Central  Wisconsin       2740.0          2167.0    5807406
22  East North Central   Michigan       5209.0          3142.0    9984072
35  East North Central       Ohio       6929.0          3320.0   11676341
13  East North Central   Illinois       6752.0          3891.0   12723071


In [42]:
homelessness_reg_fam

Unnamed: 0,region,state,individuals,family_members,state_pop
14,East North Central,Indiana,3776.0,1482.0,6695497
49,East North Central,Wisconsin,2740.0,2167.0,5807406
22,East North Central,Michigan,5209.0,3142.0,9984072
35,East North Central,Ohio,6929.0,3320.0,11676341
13,East North Central,Illinois,6752.0,3891.0,12723071
24,East South Central,Mississippi,1024.0,328.0,2981020
0,East South Central,Alabama,2570.0,864.0,4887681
17,East South Central,Kentucky,2735.0,953.0,4461153
42,East South Central,Tennessee,6139.0,1744.0,6771631
30,Mid-Atlantic,New Jersey,6048.0,3350.0,8886025


#### Subsetting columns

In [43]:
# Select the individuals column
individuals = homelessness['individuals']

# Print the head of the result
print(individuals.head())

0      2570.0
1      1434.0
2      7259.0
3      2280.0
4    109008.0
Name: individuals, dtype: float64


In [44]:
## different indexing
homelessness.individuals

0       2570.0
1       1434.0
2       7259.0
3       2280.0
4     109008.0
5       7607.0
6       2280.0
7        708.0
8       3770.0
9      21443.0
10      6943.0
11      4131.0
12      1297.0
13      6752.0
14      3776.0
15      1711.0
16      1443.0
17      2735.0
18      2540.0
19      1450.0
20      4914.0
21      6811.0
22      5209.0
23      3993.0
24      1024.0
25      3776.0
26       983.0
27      1745.0
28      7058.0
29       835.0
30      6048.0
31      1949.0
32     39827.0
33      6451.0
34       467.0
35      6929.0
36      2823.0
37     11139.0
38      8163.0
39       747.0
40      3082.0
41       836.0
42      6139.0
43     19199.0
44      1904.0
45       780.0
46      3928.0
47     16424.0
48      1021.0
49      2740.0
50       434.0
Name: individuals, dtype: float64

In [46]:
################################## TODO ##################################
# Select the state and family_members columns
state_fam = homelessness[["state", "family_members"]]

# Print the head of the result
print(state_fam.head())

        state  family_members
0     Alabama           864.0
1      Alaska           582.0
2     Arizona          2606.0
3    Arkansas           432.0
4  California         20964.0


In [47]:
################################## TODO ##################################
# Select only the individuals and state columns, in that order
ind_state = homelessness[["individuals", "state"]]

# Print the head of the result
print(ind_state.head())

   individuals       state
0       2570.0     Alabama
1       1434.0      Alaska
2       7259.0     Arizona
3       2280.0    Arkansas
4     109008.0  California


#### Subsetting rows

In [48]:
# Filter for rows where individuals is greater than 10000
ind_gt_10k = homelessness[homelessness['individuals']>10000]

# See the result
print(ind_gt_10k)

                region       state  individuals  family_members  state_pop
4              Pacific  California     109008.0         20964.0   39461588
9       South Atlantic     Florida      21443.0          9587.0   21244317
32        Mid-Atlantic    New York      39827.0         52070.0   19530351
37             Pacific      Oregon      11139.0          3337.0    4181886
43  West South Central       Texas      19199.0          6111.0   28628666
47             Pacific  Washington      16424.0          5880.0    7523869


In [50]:
################################## TODO ##################################
# Filter for rows where region is Mountain
mountain_reg = homelessness[homelessness["region"]=="Mountain"]

# See the result
print(mountain_reg)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
5   Mountain    Colorado       7607.0          3250.0    5691287
12  Mountain       Idaho       1297.0           715.0    1750536
26  Mountain     Montana        983.0           422.0    1060665
28  Mountain      Nevada       7058.0           486.0    3027341
31  Mountain  New Mexico       1949.0           602.0    2092741
44  Mountain        Utah       1904.0           972.0    3153550
50  Mountain     Wyoming        434.0           205.0     577601


In [57]:
################################## TODO ##################################
# Filter for rows where family_members is less than 1000 
# and region is Pacific
fam_lt_1k_pac = homelessness[(homelessness["family_members"]<1000) & (homelessness["region"]=="Pacific")]

# See the result
print(fam_lt_1k_pac)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska       1434.0           582.0     735139


#### Subsetting rows by categorical variables


In [58]:
# Subset for rows in South Atlantic or Mid-Atlantic regions
south_mid_atlantic = homelessness[homelessness['region'].isin(['South Atlantic', 'Mid-Atlantic'])]

# See the result
print(south_mid_atlantic)

            region                 state  individuals  family_members  \
7   South Atlantic              Delaware        708.0           374.0   
8   South Atlantic  District of Columbia       3770.0          3134.0   
9   South Atlantic               Florida      21443.0          9587.0   
10  South Atlantic               Georgia       6943.0          2556.0   
20  South Atlantic              Maryland       4914.0          2230.0   
30    Mid-Atlantic            New Jersey       6048.0          3350.0   
32    Mid-Atlantic              New York      39827.0         52070.0   
33  South Atlantic        North Carolina       6451.0          2817.0   
38    Mid-Atlantic          Pennsylvania       8163.0          5349.0   
40  South Atlantic        South Carolina       3082.0           851.0   
46  South Atlantic              Virginia       3928.0          2047.0   
48  South Atlantic         West Virginia       1021.0           222.0   

    state_pop  
7      965479  
8      701547  
9 

In [61]:
# The Mojave Desert states
canu = ["California", "Arizona", "Nevada", "Utah"]

In [63]:
################################## TODO ##################################
# Filter for rows in the Mojave Desert states
mojave_homelessness = homelessness[homelessness["state"].isin(canu)]

# See the result
print(mojave_homelessness)

      region       state  individuals  family_members  state_pop
2   Mountain     Arizona       7259.0          2606.0    7158024
4    Pacific  California     109008.0         20964.0   39461588
28  Mountain      Nevada       7058.0           486.0    3027341
44  Mountain        Utah       1904.0           972.0    3153550


#### Adding new columns


In [64]:
# Add total col as sum of individuals and family_members
homelessness['total'] = homelessness['individuals'] + homelessness['family_members']

In [65]:
homelessness.total = homelessness.individuals + homelessness.family_members

In [66]:
homelessness.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,total
0,East South Central,Alabama,2570.0,864.0,4887681,3434.0
1,Pacific,Alaska,1434.0,582.0,735139,2016.0
2,Mountain,Arizona,7259.0,2606.0,7158024,9865.0
3,West South Central,Arkansas,2280.0,432.0,3009733,2712.0
4,Pacific,California,109008.0,20964.0,39461588,129972.0


In [12]:
# Drop column "total".
# drop(labels, axis)
# axis{0 or ‘index’, 1 or ‘columns’}, default 0
# Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’).
homelessness = homelessness.drop('total', 1)

Unnamed: 0,region,state,individuals,family_members,state_pop
0,East South Central,Alabama,2570.0,864.0,4887681
1,Pacific,Alaska,1434.0,582.0,735139
2,Mountain,Arizona,7259.0,2606.0,7158024
3,West South Central,Arkansas,2280.0,432.0,3009733
4,Pacific,California,109008.0,20964.0,39461588


In [67]:
# Add p_individuals col as proportion of individuals
homelessness['p_individuals'] = homelessness['individuals']/homelessness['total']

In [68]:
# See the result
print(homelessness)

                region                 state  individuals  family_members  \
0   East South Central               Alabama       2570.0           864.0   
1              Pacific                Alaska       1434.0           582.0   
2             Mountain               Arizona       7259.0          2606.0   
3   West South Central              Arkansas       2280.0           432.0   
4              Pacific            California     109008.0         20964.0   
5             Mountain              Colorado       7607.0          3250.0   
6          New England           Connecticut       2280.0          1696.0   
7       South Atlantic              Delaware        708.0           374.0   
8       South Atlantic  District of Columbia       3770.0          3134.0   
9       South Atlantic               Florida      21443.0          9587.0   
10      South Atlantic               Georgia       6943.0          2556.0   
11             Pacific                Hawaii       4131.0          2399.0   

In [70]:
## Wrapping up
# Create indiv_per_10k col as homeless individuals per 10k state pop
homelessness["indiv_per_10k"] = 10000 * homelessness['individuals'] / homelessness['state_pop'] 

In [71]:
################################## TODO ##################################
# Subset rows for indiv_per_10k greater than 20
high_homelessness = homelessness[homelessness["indiv_per_10k"]>20]

In [76]:
################################## TODO ##################################
# Sort high_homelessness by descending indiv_per_10k
high_homelessness_srt = homelessness.sort_values(["indiv_per_10k"], ascending = False)

In [79]:
################################## TODO ##################################
# From high_homelessness_srt, select the state and indiv_per_10k cols
result = high_homelessness_srt[["state", "indiv_per_10k"]]

# See the result
print(result)

                   state  indiv_per_10k
8   District of Columbia      53.738381
11                Hawaii      29.079406
4             California      27.623825
37                Oregon      26.636307
28                Nevada      23.314189
47            Washington      21.829195
32              New York      20.392363
1                 Alaska      19.506515
5               Colorado      13.366045
45               Vermont      12.492833
19                 Maine      10.828516
2                Arizona      10.141067
9                Florida      10.093523
21         Massachusetts       9.895919
41          South Dakota       9.514077
31            New Mexico       9.313145
26               Montana       9.267771
42             Tennessee       9.065763
27              Nebraska       9.062045
20              Maryland       8.141420
3               Arkansas       7.575423
50               Wyoming       7.513837
12                 Idaho       7.409159
7               Delaware       7.333148


## Aggregating Data

In [80]:
# Print the head of the sales DataFrame
print(sales.head())

   store type  department       date  weekly_sales  is_holiday  temperature_c  \
0      1    A           1 2010-02-05      24924.50       False       5.727778   
1      1    A           2 2010-02-05      50605.27       False       5.727778   
2      1    A           3 2010-02-05      13740.12       False       5.727778   
3      1    A           4 2010-02-05      39954.04       False       5.727778   
4      1    A           5 2010-02-05      32229.38       False       5.727778   

   fuel_price_usd_per_l  unemployment  
0              0.679451         8.106  
1              0.679451         8.106  
2              0.679451         8.106  
3              0.679451         8.106  
4              0.679451         8.106  


In [81]:
# Print the info about the sales DataFrame
print(sales.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413119 entries, 0 to 413118
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   store                 413119 non-null  int64         
 1   type                  413119 non-null  object        
 2   department            413119 non-null  int32         
 3   date                  413119 non-null  datetime64[ns]
 4   weekly_sales          413119 non-null  float64       
 5   is_holiday            413119 non-null  bool          
 6   temperature_c         413119 non-null  float64       
 7   fuel_price_usd_per_l  413119 non-null  float64       
 8   unemployment          413119 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(4), int32(1), int64(1), object(1)
memory usage: 27.2+ MB
None


In [82]:
# Print the mean of weekly_sales
print(sales['weekly_sales'].mean())

16094.726811185497


In [83]:
################################## TODO ##################################
# Print the median of weekly_sales
print(sales["weekly_sales"].median())

7682.47


In [84]:
# Print the maximum of the date column
print(sales['date'].max())

2012-10-26 00:00:00


In [85]:
# Print the minimum of the date column
print(sales['date'].min())

2010-02-05 00:00:00


In [25]:
## Efficient summaries : usage of .agg() method
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

In [26]:
# Print IQR of the temperature_c column
print(sales['temperature_c'].agg(iqr))

15.299999999999994


#### Dropping duplicates and Counting categorical variables


In [27]:
# Drop duplicate store/type combinations
store_types = sales.drop_duplicates(subset=["store", "type"])
print(store_types.head())

       store type  department       date  weekly_sales  is_holiday  \
0          1    A           1 2010-02-05      24924.50       False   
10244      2    A           1 2010-02-05      35034.06       False   
20482      3    B           1 2010-02-05       6453.58       False   
29518      4    A           1 2010-02-05      38724.42       False   
39790      5    B           1 2010-02-05       9323.89       False   

       temperature_c  fuel_price_usd_per_l  unemployment  
0           5.727778              0.679451         8.106  
10244       4.550000              0.679451         8.324  
20482       7.616667              0.679451         7.368  
29518       6.533333              0.686319         8.623  
39790       4.277778              0.679451         6.566  


In [28]:
# Count the number of stores of each type
store_counts = store_types["type"].value_counts()
print(store_counts)

A    22
B    17
C     6
Name: type, dtype: int64


In [None]:
################################## TODO ##################################
# Get the proportion of stores of each type
# store_props = ?
print(store_props)

In [33]:
# Drop duplicate store/department combinations
store_depts = sales.drop_duplicates(subset=["store", "department"])
print(store_depts.head())


   store type  department       date  weekly_sales  is_holiday  temperature_c  \
0      1    A           1 2010-02-05      24924.50       False       5.727778   
1      1    A           2 2010-02-05      50605.27       False       5.727778   
2      1    A           3 2010-02-05      13740.12       False       5.727778   
3      1    A           4 2010-02-05      39954.04       False       5.727778   
4      1    A           5 2010-02-05      32229.38       False       5.727778   

   fuel_price_usd_per_l  unemployment  
0              0.679451         8.106  
1              0.679451         8.106  
2              0.679451         8.106  
3              0.679451         8.106  
4              0.679451         8.106  


In [34]:
# Count the number of each department number and sort
dept_counts_sorted = store_depts["department"].value_counts(sort=True)
print(dept_counts_sorted)

2     45
4     45
9     45
7     45
5     45
      ..
37    20
50    14
43     5
39     5
65     1
Name: department, Length: 81, dtype: int64


In [None]:
################################## TODO ##################################
# Get the proportion of departments of each number and sort
# dept_props_sorted = ?

print(dept_props_sorted)

In [None]:
# Subset the rows that are holiday weeks and drop duplicate dates
holiday_dates = None

In [None]:
# Print date col of holiday_dates
print(holiday_dates["date"])

#### What percent of sales occurred at each store type?

In [36]:
# Calculate total weekly sales
sales_all = sales["weekly_sales"].sum()

In [39]:
# Subset for type A stores, calculate total weekly sales
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()
sales_A

4331014722.749999

In [38]:
sales[sales["type"] == "A"].weekly_sales.sum()

4331014722.749999

In [40]:
# Subset for type B stores, calculate total weekly sales
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()

In [41]:
# Subset for type C stores, calculate total weekly sales
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()

In [42]:
# Get proportion for each type
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all
print(sales_propn_by_type)

[0.65137469 0.28763851 0.0609868 ]


In [44]:
## Better solution using .groupby()
# Group by type; calculate total weekly sales
sales_by_type = sales.groupby("type")["weekly_sales"].sum()
sales_by_type

type
A    4.331015e+09
B    1.912519e+09
C    4.055035e+08
Name: weekly_sales, dtype: float64

In [45]:
################################## TODO ##################################
# Get proportion for each type
# sales_propn_by_type = ?

print(sales_propn_by_type)

type
A    0.651375
B    0.287639
C    0.060987
Name: weekly_sales, dtype: float64


In [None]:
# From previous step
sales_by_type = sales.groupby("type")["weekly_sales"].sum()

In [48]:
################################## TODO ##################################
# Group by type and is_holiday; calculate total weekly sales
# sales_propn_by_type_is_holiday = ?

print(sales_propn_by_type_is_holiday)

type  is_holiday
A     False         4.007612e+09
      True          3.234028e+08
B     False         1.765411e+09
      True          1.471081e+08
C     False         3.772478e+08
      True          2.825570e+07
Name: weekly_sales, dtype: float64


In [49]:
# Import NumPy with the alias np
import numpy as np

In [50]:
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([min, max, np.mean, np.median])

# Print sales_stats
print(sales_stats)

          min        max          mean    median
type                                            
A    -4988.94  474330.10  20099.568043  10105.17
B    -3924.00  693099.36  12335.331875   6269.02
C     -379.00  112152.35   9519.532538   1149.67


In [51]:
################################## TODO ##################################
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
# Use groupby
# unemp_fuel_stats = ?

# Print unemp_fuel_stats
print(unemp_fuel_stats)

     unemployment                          fuel_price_usd_per_l            \
              min     max      mean median                  min       max   
type                                                                        
A           3.879  14.313  7.791595  7.818             0.653034  1.180321   
B           4.125  14.313  7.889666  7.806             0.664129  1.180321   
C           5.217  14.313  8.934350  8.300             0.664129  1.180321   

                          
          mean    median  
type                      
A     0.883391  0.902676  
B     0.892997  0.922225  
C     0.888848  0.902676  


In [None]:
# For each store type, aggregate weekly_sales: get min, max, mean, and median
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
sales.pivot_table(values='weekly_sales', index='type', aggfunc=[np.min, np.max, np.mean, np.median], fill_value = 0, margins=True)

# Print sales_stats
print(sales_stats)

In [None]:
# For each store type, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
# Print unemp_fuel_stats
print(unemp_fuel_stats)

################################## TODO ##################################
# Create a pivot table with the same results


In [None]:
# For each pair of store type and is_holiday, aggregate unemployment and fuel_price_usd_per_l: get min, max, mean, and median
################################## TODO ##################################
# Use group by
# unemp_fuel_stats_is_holyday = ?
print(unemp_fuel_stats_is_holyday)
################################## TODO ##################################
# Use pivot table
# pt_unemp_fuel_stats_is_holyday = ?
print(pt_unemp_fuel_stats_is_holyday)

## Slicing and Indexing

In [52]:
## https://www.kaggle.com/sudalairajkumar/daily-temperature-of-major-cities

temperatures= pd.read_csv('temperature.csv')
temperatures.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,10,Africa,Algeria,,Algiers,1,11,1995,51.7
2,20,Africa,Algeria,,Algiers,1,21,1995,54.2
3,30,Africa,Algeria,,Algiers,1,31,1995,56.9
4,40,Africa,Algeria,,Algiers,2,10,1995,59.0


In [53]:
# Index temperatures by city
temperatures_ind = temperatures.set_index('City')

# Look at temperatures_ind
print(temperatures_ind)


                      Unnamed: 0         Region  Country  \
City                                                       
Algiers                        0         Africa  Algeria   
Algiers                       10         Africa  Algeria   
Algiers                       20         Africa  Algeria   
Algiers                       30         Africa  Algeria   
Algiers                       40         Africa  Algeria   
...                          ...            ...      ...   
San Juan Puerto Rico     2906280  North America       US   
San Juan Puerto Rico     2906290  North America       US   
San Juan Puerto Rico     2906300  North America       US   
San Juan Puerto Rico     2906310  North America       US   
San Juan Puerto Rico     2906320  North America       US   

                                       State  Month  Day  Year  AvgTemperature  
City                                                                            
Algiers                                  NaN      1    1 

In [None]:
################################## TODO ##################################
# Reset the index, keeping its contents


In [None]:
################################## TODO ##################################
# Reset the index, dropping its contents


In [None]:
## Subsetting with .loc[]
# Make a list of cities, "Seoul","Chicago", "Osaka"
cities = ["Seoul","Chicago", "Osaka"]

In [None]:
# Subset temperatures using square brackets
print(temperatures[temperatures["City"].isin(cities)])

In [None]:
# Subset temperatures_ind using .loc[]
print(temperatures_ind.loc[cities])

In [None]:
temperatures_ind.index

In [None]:
## Subsetting with .loc[]
# Make a list of cities, Seoul, Chicago, Osaka
cities = ["Seoul", "Chicago", "Osaka"]

In [None]:
################################## TODO ##################################
# Subset temperatures only for cities using square brackets


In [None]:
################################## TODO ##################################
# Subset temperatures_ind using .loc[]


In [None]:
################################## TODO ##################################
# Index temperatures by country & city
# temperatures_ind2 = ?


In [None]:
# List of tuples: Brazil, Rio De Janeiro & Pakistan, Lahore
rows_to_keep = [('US', 'Los Angeles'),('Japan','Osaka')]

In [None]:
# Subset for rows to keep
print(temperatures_ind.loc[rows_to_keep])

In [None]:
# Sort temperatures_ind by index values
print(temperatures_ind.sort_index())

In [None]:
# Sort temperatures_ind by index values at the city level
print(temperatures_ind.sort_index(level="City"))

In [None]:
# Sort temperatures_ind by country then descending city
print(temperatures_ind.sort_index(level=["Country", "City"], ascending = [True, False]))

In [None]:
## Slicing index values
# Sort the index of temperatures_ind
temperatures_srt = temperatures_ind.sort_index()

In [None]:
# Subset rows from Pakistan to Russia
print(temperatures_srt.loc['Pakistan':'Russia'])

In [None]:
################################## TODO ##################################
# Try to subset rows from Lahore to Moscow


In [None]:
# Subset rows from Pakistan, Lahore to Russia, Moscow
print(temperatures_srt.loc[('Pakistan','Lahore'):('Russia','Moscow')])

In [None]:
## Slicing in both directions
################################## TODO ##################################
# Subset rows from India, Hyderabad to Iraq, Baghdad


In [None]:
# Subset columns from date to avg_temp_c
print(temperatures_srt.loc[:, 'Year':'AvgTemperature'])

In [None]:
################################## TODO ##################################
# Subset in both directions at once


#### Slicing time series


In [None]:
# Add a colum to temperature named date in format of yy-mm-dd
temperatures['date'] = pd.to_datetime(temperatures[['Year','Month','Day']][:10000])


In [None]:
# Use Boolean conditions to subset temperatures for rows in 2010 and 2011
temperatures_bool = temperatures[(temperatures['date'] >= '2010-01-01') & (temperatures['date'] <= '2011-12-31')]
print(temperatures_bool)


In [None]:
# TODO. Set date as an index
temperatures_ind = None

In [None]:
# Use .loc[] to subset temperatures_ind for rows from Aug 2010 to Feb 2011
print(temperatures_ind.loc['2010-08-01':'2011-02-28'])

In [None]:
################################## TODO ##################################
# Use .loc[] to subset temperatures_ind for rows in 2010 and 2011


In [None]:
temp_by_country_city_vs_year = temperatures.pivot_table(values='AvgTemperature', 
                                                        index=['Country', 'City'], 
                                                        columns = 'Year',fill_value=0)

# See the result
print(temp_by_country_city_vs_year)

In [None]:
# Subset for Egypt to India
temp_by_country_city_vs_year.loc['Egypt':'India']

In [None]:
################################## TODO ##################################
# Subset for Egypt, Cairo to India, Delhi


In [None]:
# Subset in both directions at once
temp_by_country_city_vs_year.loc[('Egypt','Cairo'):('India','Delhi'),'2005':'2010']


In [None]:
# Get the worldwide mean temp by year
mean_temp_by_year = temp_by_country_city_vs_year.mean()

In [None]:
# Filter for the year that had the highest mean temp
print(mean_temp_by_year[mean_temp_by_year==mean_temp_by_year.max()])

In [None]:
################################## TODO ##################################
# Get the mean temp by city
# mean_temp_by_city = ?

In [None]:
################################## TODO ##################################
# Find the city that had the lowest mean temp


## Visualization

In [None]:
## Which avocado size is most popular?
with open('avoplotto.pkl', 'rb') as f:
    avocados = pickle.load(f)

In [None]:
# Look at the first few rows of data
avocados.head()

In [None]:
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt


In [None]:
# Look at the first few rows of data
print(avocados.head())

In [None]:
# TODO. Get the total number of avocados sold of each size
nb_sold_by_size = None

In [None]:
# Create a bar plot of the number of avocados sold by size
nb_sold_by_size.plot(kind='bar')

# Show the plot
plt.show()

In [None]:
## Changes in sales over time

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

################################## TODO ##################################
# Get the total number of avocados sold on each date
# nb_sold_by_date = ?


In [None]:
# Create a line plot of the number of avocados sold by date
nb_sold_by_date.plot(x='date')

# Show the plot
plt.show()

In [None]:
# Scatter plot of nb_sold vs avg_price with title
avocados.plot(x='nb_sold', y='avg_price', kind='scatter', title="Number of avocados sold vs. average price")

# Show the plot
plt.show()

In [None]:
# Histogram of conventional avg_price 
avocados[avocados["type"] == "conventional"]["avg_price"].hist()

# Histogram of organic avg_price
avocados[avocados["type"] == "organic"]["avg_price"].hist(alpha=0.5, bins=20)

# Add a legend
plt.legend(["conventional", "organic"])

# Show the plot
plt.show()

In [None]:
avocados.head()

In [None]:
## Finding missing values

# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Check individual values for missing values
avocados_2015 = avocados[avocados['year']==2015]
print(avocados_2015.isna().sum())

In [None]:
# Check each column for missing values
print(avocados_2015.isna().any())

In [None]:
# Bar plot of missing values by variable
avocados_2015.isna().sum().plot(kind='bar')

# Show plot
plt.show()

## Creating DataFrames


In [None]:
# Create a list of dictionaries with new data
avocados_list = [
    {'date': "2019-11-03", 'small_sold': 10376832, 'large_sold':7835071},
    {'date': "2019-11-10", 'small_sold': 10717154, 'large_sold':8561348},
]

################################## TODO ##################################
# Convert list into DataFrame
# avocados_2019 = ?
##########################################################################

# Print the new DataFrame
print(avocados_2019)

In [None]:
# Create a dictionary of lists with new data
avocados_dict = {
  "date": ['2019-11-17','2019-12-01'],
  "small_sold": [10859987, 9291631],
  "large_sold": [7674135, 6238096]
}

################################## TODO ##################################
# Convert dictionary into DataFrame
# avocados_2019 = ?
##########################################################################

# Print the new DataFrame
print(avocados_2019)