# Topics for Tutorial 2

1. Univariate data 
2. Summary statistics
3. Quantiles
4. Histograms
5. Bar charts
6. Encoding categorical variables

# Important Python Packages
* Pandas
* Seaborn
* Matplotlib

Join Zoom: https://huji.zoom.us/j/85064392593?pwd=Ymp0bXJRcXpCLzJkM2xHRi9BSHFvZz09

In [None]:
# import necessary packages
import pandas as pd
import seaborn as sns
# `plt` is the conventional alias for the pyplot interface, not for the
# top-level matplotlib package
import matplotlib.pyplot as plt

### Today's datasets

* City Temperatures -- Daily temperature for different international cities

In [None]:
# load the city temperature table from its CSV file into a DataFrame
data_path = 'city_temp.csv'
temp_df = pd.read_csv(data_path)

In [None]:
# peek at the first five rows of the data
# (Visual Studio Code also offers another way to inspect it)
temp_df.head(n=5)

In [None]:
# list the distinct city names present in the dataset
temp_df['City'].unique()

In [None]:
# isolate data from a single city (e.g., Tel Aviv)
city = 'Tel Aviv'
# take an explicit copy: TA_temp is modified further down (rows dropped,
# a Celsius column added), and a copy keeps those changes independent of
# temp_df and free of SettingWithCopyWarning regardless of cell order
TA_temp = temp_df[temp_df['City'] == city].copy()
TA_temp

In [None]:
# summary statistics (count, mean, std, quartiles, ...) of the raw
# temperature column for the selected city
TA_temp['AvgTemperature'].describe()

In [None]:
# remove readings that don't make sense: any row whose recorded
# temperature is below -50 is dropped
implausible_rows = TA_temp.loc[TA_temp['AvgTemperature'] < -50].index
TA_temp = TA_temp.drop(index=implausible_rows)

In [None]:
# add a Celsius column: subtract 32, then scale by 5/9
TA_temp['AvgTemp_C'] = TA_temp['AvgTemperature'].sub(32).mul(5/9)

In [None]:
# get summary statistics in Celsius
TA_temp['AvgTemp_C'].describe()

In [None]:
# get the mean (in Celsius) for the city you chose and report it
# rounded to two decimals
mean_temp = TA_temp['AvgTemp_C'].mean()
print(f"The mean temperature in {city} is: {mean_temp:.2f} degrees Celsius")

In [None]:
# get the median temperature (in Celsius) for the city you chose and
# report it rounded to two decimals
median_temp = TA_temp['AvgTemp_C'].median()
print(f"The median temperature in {city} is: {median_temp:.2f} degrees Celsius")

In [None]:
# get the 10th percentile (the 0.1 quantile, in Celsius) for the city
# you chose and report it rounded to two decimals
percentile_10 = TA_temp['AvgTemp_C'].quantile(0.1)
print(f"The tenth percentile in {city} is: {percentile_10:.2f} degrees Celsius")

In [None]:
# get the 90th percentile (the 0.9 quantile, in Celsius) for the city
# you chose and report it rounded to two decimals
percentile_90 = TA_temp['AvgTemp_C'].quantile(0.9)
print(f"The ninetieth percentile in {city} is: {percentile_90:.2f} degrees Celsius")

In [None]:
# begin plotting: select seaborn's "whitegrid" theme before drawing any figures
sns.set_theme(style="whitegrid")

In [None]:
# box plot of the Celsius temperatures for the chosen city,
# with the city name in the x-axis label
ax = sns.boxplot(data=TA_temp, x='AvgTemp_C')
ax.set(xlabel=f'Average Daily Temperature in {city}')

In [None]:
# compare all the cities: apply the same cleaning (drop readings below -50)
# and Celsius conversion to the full table, then draw one box per city
implausible_rows = temp_df.loc[temp_df['AvgTemperature'] < -50].index
temp_df = temp_df.drop(index=implausible_rows)
temp_df['AvgTemp_C'] = temp_df['AvgTemperature'].sub(32).mul(5/9)
ax = sns.boxplot(data=temp_df, x='AvgTemp_C', y='City')
ax.set(xlabel='Average Daily Temperature')

In [None]:
# violin plot of the Celsius temperatures, one violin per city
ax = sns.violinplot(data=temp_df, x='AvgTemp_C', y='City')
ax.set(xlabel='Average Daily Temperature')

In [None]:
# histogram of the chosen city's Celsius temperatures (no bin count given)
ax = sns.histplot(data=TA_temp, x='AvgTemp_C')
ax.set(xlabel=f'Average Daily Temperature in {city}')

In [None]:
# experiment with the histogram's bin count -- first many bins (100)
ax = sns.histplot(data=TA_temp, x='AvgTemp_C', bins=100)
ax.set(xlabel='Average Daily Temperature')

In [None]:
# same histogram with far fewer bins (10)
ax = sns.histplot(data=TA_temp, x='AvgTemp_C', bins=10)
ax.set(xlabel='Average Daily Temperature')

In [None]:
# 20-bin histogram with a kernel density estimate drawn on top
ax = sns.histplot(data=TA_temp, x='AvgTemp_C', bins=20, kde=True)
ax.set(xlabel='Average Daily Temperature')

In [None]:
# how can we normalize the histogram data? -- ask for the "density" statistic
ax = sns.histplot(data=TA_temp, x='AvgTemp_C', bins=20, kde=True, stat="density")
ax.set(xlabel='Average Daily Temperature')

In [None]:
# what's the difference between the "density" stat and the "probability" stat?
# same plot as before, now using stat="probability"
ax = sns.histplot(data=TA_temp, x='AvgTemp_C', bins=20, kde=True, stat="probability")
ax.set(xlabel='Average Daily Temperature')

In [None]:
# histogram of the Celsius temperatures for every city, colored by city
ax = sns.histplot(data=temp_df, x='AvgTemp_C', hue='City')
ax.set(xlabel='Average Daily Temperature')

In [None]:
# another type of histogram: temperature on x, city on y, colored by city,
# with the legend switched off
ax = sns.histplot(data=temp_df, x='AvgTemp_C', y='City', hue='City', legend=False)

### Encoding categorical variables

Sometimes, for reasons that will be clear on the HW, we'll want to encode our categorical variables so that they are numbers instead. 

There are many ways that we can achieve this. 

Here we will learn one; for more examples see: https://pbpython.com/categorical-encoding.html

In [None]:
# reminder: which city names appear in the data?
temp_df['City'].unique()

In [None]:
# dictionary for encoding cities: maps column name -> {city name: integer code}
# (note: more than one column can be encoded at a time by adding more keys)
# codes are assigned 1, 2, 3, ... in the order the cities are listed here
city_order = ["Lilongwe", "Capetown", "Tel Aviv", "Amman", "Beirut", "Rochester"]
cleanup_cities = {
    "City": {name: code for code, name in enumerate(city_order, start=1)},
}

In [None]:
# build a new dataframe in which the values listed in cleanup_cities are
# replaced by their codes; temp_df itself is left unchanged
temp_df_encoded = temp_df.replace(to_replace=cleanup_cities)


In [None]:
# option 2 -- use Pandas

# what are our data types? (shows the dtype of each column of temp_df)
temp_df.dtypes


In [None]:
# convert the City column to pandas' categorical dtype
temp_df["City"] = temp_df["City"].astype(pd.CategoricalDtype())


In [None]:
# encode the variable using the integer codes of the categorical City column
city_codes = temp_df["City"].cat.codes
temp_df["City_encoded"] = city_codes
