# COGS 9 Section 01 Discussion 2: Python Basics continued + Introduction to Pandas

## What we will go over today
* Lists, Dictionaries, Sets
* Introduction to Pandas

# Part 1

### Lists

In [None]:
# Create a list of three fruit names and print it.
fruits = ["apple", "banana", "cherry"]
print(fruits)

['apple', 'banana', 'cherry']


In [None]:
# Indexing is zero-based, so index 0 is the first element.
fruits[0]

'apple'

In [None]:
# A negative index counts from the end; -1 is the last element.
fruits[-1]

'cherry'

In [None]:
# Lists are mutable: assigning to an index replaces that element in place.
fruits[0] = "orange"
print(fruits)

['orange', 'banana', 'cherry']


In [None]:
# append() adds a single element to the end of the list.
fruits.append("grape")
print(fruits)

['orange', 'banana', 'cherry', 'grape']


In [None]:
# The last element is now the one that was just appended.
fruits[-1]

'grape'

### Dictionaries

In [None]:
# A dictionary maps keys (fruit names) to values (prices).
fruit_prices = {'apple': 0.75, 'banana': 0.5, 'orange': 0.9}

In [None]:
# Look up a value by key with square brackets; a missing key raises KeyError.
price = fruit_prices['apple']
print(price)

0.75


In [None]:
# get() returns the given default (0 here) instead of raising when the key is absent.
price = fruit_prices.get('grape', 0)
print(price)

0


In [None]:
# Assigning to a key that is not yet present adds a new key/value pair.
fruit_prices['grape'] = 1.5

In [None]:
# del removes the key and its value; it raises KeyError if the key is missing.
del fruit_prices['grape']

In [None]:
# keys() and values() return view objects over the dictionary, not lists.
# Note: this rebinds the name `fruits`, which until now referred to the list
# built in the Lists section.
fruits = fruit_prices.keys()
prices = fruit_prices.values()
print(fruits)
print(prices)

dict_keys(['apple', 'banana', 'orange'])
dict_values([0.75, 0.5, 0.9])


In [None]:
# Iterate over key/value pairs directly with items() instead of looping over
# the keys and looking each value up a second time.
for fruit, price in fruit_prices.items():
    print(fruit, price)

apple 0.75
banana 0.5
orange 0.9


### Sets

In [None]:
# A set keeps only unique elements, so the repeated 3s and 4s collapse to one each.
# Written as a set literal rather than set([...]) to avoid building a throwaway list.
my_set = {1, 2, 3, 3, 4, 4, 4, 5}
print(my_set)

{1, 2, 3, 4, 5}


In [None]:
# add() inserts an element; adding one that is already present has no effect.
my_set.add(6)
print(my_set)

{1, 2, 3, 4, 5, 6}


In [None]:
# remove() deletes an element and raises KeyError if it is not in the set.
my_set.remove(6)
print(my_set)

{1, 2, 3, 4, 5}


In [None]:
# remove() raises a KeyError when the element is not in the set.
# discard() removes the element if it is present and does nothing otherwise,
# so it is safe to call here even though 6 has already been removed.
my_set.discard(6)

In [None]:
# Two small sets written as literals; they share the element 3.
set1 = {1, 2, 3}
set2 = {3, 4, 5}

In [None]:
# The `in` operator tests membership and evaluates to True or False.
print(3 in set1)
print(6 in set1)

True
False


# Part 2

## Installing and Using Pandas

In [None]:
# Import the library and check which version is installed.
import pandas
pandas.__version__

'1.3.5'

In [None]:
# Usually imported with the alias pd
import pandas as pd

## The Pandas Series Object

A Pandas ``Series`` is a one-dimensional array of indexed data.
It can be created from a list or array as follows:

In [None]:
# Build a Series from a list; no index is given, so a default integer index is used.
data = pd.Series([0.2, 0.4, 0.6, 0.8, 1.0])
data

0    0.2
1    0.4
2    0.6
3    0.8
4    1.0
dtype: float64

As we see in the output, the ``Series`` wraps both a sequence of values and a sequence of indices, which we can access with the ``values`` and ``index`` attributes.
The ``values`` are simply a familiar NumPy array:

In [None]:
# The underlying data, exposed as a NumPy array.
data.values

array([0.2, 0.4, 0.6, 0.8, 1. ])

In [None]:
# The index labels that accompany the values.
data.index

RangeIndex(start=0, stop=5, step=1)

In [None]:
# Map each item name to its price, then wrap the mapping in a Series.
# The dictionary keys become the index labels and the values become the data.
price_fruits_dict = {
    'Orange': 10,
    'Mango': 23,
    'Tomato': 8,
    'Apple': 5,
    'Cherry': 15,
}
price_fruits = pd.Series(price_fruits_dict)
price_fruits

Orange    10
Mango     23
Tomato     8
Apple      5
Cherry    15
dtype: int64

By default, a ``Series`` created from a dictionary takes its index from the dictionary keys, in the order they were inserted (note that the output above is not sorted).
From here, typical dictionary-style item access can be performed:

In [None]:
# Look up a value by its index label, just like a dictionary key.
price_fruits['Cherry']

15

## The Pandas DataFrame Object

The next fundamental structure in Pandas is the ``DataFrame``.
Like the ``Series`` object discussed in the previous section, the ``DataFrame`` can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary.

#### From a list of dicts

Any list of dictionaries can be made into a ``DataFrame``.
We'll use a simple list comprehension to create some data:

In [None]:
# Build three row dicts with a list comprehension; each dict becomes one row
# and its keys become the column names.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


Even if some keys in the dictionary are missing, Pandas will fill them in with ``NaN`` (i.e., "not a number") values:

In [None]:
# The two dicts have different keys; the resulting columns are the union of all keys,
# and a row that lacks a key gets NaN in that column.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


## Diving deep into Pandas with a dataset

### Importing sample data provided in Google Colab
`read_csv()` is the function used to read a CSV file into a pandas ``DataFrame``.

In [None]:
# Load the CSV file into a DataFrame and display it.
# The relative path assumes the notebook runs where ./sample_data exists
# (the sample data folder provided in Google Colab).
csv_data = pd.read_csv("./sample_data/california_housing_test.csv")
csv_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0


`head()` allows us to view the first 5 rows of the table by default; pass a number (e.g., `head(10)`) to see a different number of rows.

In [None]:
# First five rows; head() accepts an optional row count and defaults to 5.
csv_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


`tail()` allows us to view the last 5 rows of the table by default; pass a number (e.g., `tail(10)`) to see a different number of rows.

In [None]:
# Last five rows; tail() accepts an optional row count and defaults to 5.
csv_data.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.179,225000.0
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2997,-119.7,36.3,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0
2998,-117.12,34.1,40.0,96.0,14.0,46.0,14.0,3.2708,162500.0
2999,-119.63,34.42,42.0,1765.0,263.0,753.0,260.0,8.5608,500001.0


`describe()` allows us to view descriptive statistics, including those that summarize the central tendency, dispersion, and shape of a dataset’s distribution.

In [None]:
# Summary statistics for every column; include='all' asks for all columns
# rather than only the numeric ones.
csv_data.describe(include='all')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205846.275
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113119.68747
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121200.0
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177650.0
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263975.0
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500001.0


`columns` allows us to view the names of all the columns.

In [None]:
# The column labels, returned as an Index object.
csv_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [None]:
# Selecting a single column by name returns a Series.
csv_data['median_income']

0       6.6085
1       3.5990
2       5.7934
3       6.1359
4       2.9375
         ...  
2995    1.1790
2996    3.3906
2997    2.2895
2998    3.2708
2999    8.5608
Name: median_income, Length: 3000, dtype: float64

`iloc` is used for indexing by integer position (row number, counting from 0).

In [None]:
# iloc selects by integer position; position 3 is the fourth row, returned as a Series.
csv_data.iloc[3]

longitude               -118.3600
latitude                  33.8200
housing_median_age        28.0000
total_rooms               67.0000
total_bedrooms            15.0000
population                49.0000
households                11.0000
median_income              6.1359
median_house_value    330000.0000
Name: 3, dtype: float64

In [None]:
# Chain the two: pick the column first, then the value at position 3 within it.
csv_data['median_income'].iloc[3]

6.1359

In [None]:
# Passing a list of column names selects several columns and returns a DataFrame.
csv_data[['longitude', 'latitude']]

Unnamed: 0,longitude,latitude
0,-122.05,37.37
1,-118.30,34.26
2,-117.81,33.78
3,-118.36,33.82
4,-119.67,36.33
...,...,...
2995,-119.86,34.42
2996,-118.14,34.06
2997,-119.70,36.30
2998,-117.12,34.10


In [None]:
# Slicing with iloc follows Python slice rules: positions 5 through 9, with the end (10) excluded.
csv_data.iloc[5:10]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
5,-119.56,36.51,37.0,1018.0,213.0,663.0,204.0,1.6635,67000.0
6,-121.43,38.63,43.0,1009.0,225.0,604.0,218.0,1.6641,67000.0
7,-120.65,35.48,19.0,2310.0,471.0,1341.0,441.0,3.225,166900.0
8,-122.84,38.4,15.0,3080.0,617.0,1446.0,599.0,3.6696,194400.0
9,-118.02,34.08,31.0,2402.0,632.0,2830.0,603.0,2.3333,164200.0
