# Statistical Analysis and Machine Learning on NYC Weather Data

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

### Import the data from a comma-separated values (CSV) file

In [None]:
# Raw observations exactly as stored in the CSV (imperial units).
nyc_weather = pd.read_csv('nyc_weather.csv')
# Independent working copy that is to be converted to metric units.
# Copying the frame avoids parsing the same file a second time and guarantees
# that both frames start from the very same snapshot of the data.
nycISI = nyc_weather.copy()
nyc_weather.head()  # preview

In [None]:
# Display the column labels of nycISI.
nycISI.columns

### Conversion of the units to metric (SI) units in the data frame `nycISI`

In [None]:
# Convert the imperial columns of nyc_weather to metric values and store them
# in nycISI under names that carry the new unit.

MILES_TO_KM = 1.609344  # international mile in kilometres (instead of the rough 1.6)

# old (Fahrenheit) column name -> new (Celsius) column name
fahrenheit_columns = {
    'Max.TemperatureF': 'Max.TemperatureC',
    'Mean.TemperatureF': 'Mean.TemperatureC',
    'Min.TemperatureF': 'Min.TemperatureC',
    'Max.Dew.PointF': 'Max.Dew.PointC',
    'MeanDew.PointF': 'Mean.Dew.PointC',
    'Min.DewpointF': 'Min.Dew.PointC',
}

# old (miles / miles per hour) column name -> new (km / km per hour) column name
mile_columns = {
    'Max.VisibilityMiles': 'Max.VisibilityKm',
    'Mean.VisibilityMiles': 'Mean.VisibilityKm',
    'Min.VisibilityMiles': 'Min.VisibilityKm',
    'Max.Wind.SpeedMPH': 'Max.Wind.SpeedKPH',
    'Mean.Wind.SpeedMPH': 'Mean.Wind.SpeedKPH',
}

rename_map = dict(fahrenheit_columns)
rename_map.update(mile_columns)
# Renaming keeps every column at its original position; labels that are already
# renamed are simply ignored, so the cell can be re-run safely.
nycISI = nycISI.rename(columns=rename_map)

# Assign one Series per column: the values are always read from the untouched
# nyc_weather frame, and the result does not depend on how a given pandas
# version aligns the column labels of a multi-column frame assignment.
for imperial_name, metric_name in fahrenheit_columns.items():
    nycISI[metric_name] = (nyc_weather[imperial_name] - 32) / 1.8
for imperial_name, metric_name in mile_columns.items():
    nycISI[metric_name] = nyc_weather[imperial_name] * MILES_TO_KM

# 1. Simple Statistical Analysis

## What is the distribution of temperature and wind speed?

In [None]:
# Summary statistics of the mean-temperature column.
nycISI.loc[:, 'Mean.TemperatureC'].describe()

In [None]:
# Report the overall mean temperature together with the first and last date of the record.
first_date = nycISI['Date'].iloc[0]
last_date = nycISI['Date'].iloc[-1]
overall_mean_c = round(nycISI['Mean.TemperatureC'].mean(), 3)
print('The average temperature in NYC from {} to {} is {} C.'.format(first_date, last_date, overall_mean_c))

### Average temperature variation in NYC 

In [None]:
# Plot the mean temperature of every record; the x axis is the row index of nycISI.
fig, ax = plt.subplots()
ax.plot(nycISI['Mean.TemperatureC'])
# Title and axis labels so the figure can be read without the surrounding text.
ax.set_title('Mean Temperature in NYC')
ax.set_xlabel('Record index')
ax.set_ylabel('Mean temperature (C)')
plt.show()

## Average Yearly temperature variation from 1949 to 2014

In [None]:
# Mean temperature per calendar year.
# The year of a record is the first four characters of its 'Date' string.
year_labels = nycISI['Date'].str[:4]
StartYear = int(year_labels.iloc[0]) + 1 # the +1 is to not include the first year (1948) whose data is only partially available
EndYear = int(year_labels.iloc[-1])
years = list(range(StartYear, EndYear + 1))

# One vectorised group-by instead of scanning every date once per year.
yearly_mean = nycISI['Mean.TemperatureC'].groupby(year_labels).mean()

# Build the result frame in one go: the column is float from the start and no
# chained assignment (nycTemp[...][i] = value) is needed, which pandas may apply
# to a temporary copy instead of the frame itself.
# Years without any record come out as NaN through reindex.
nycTemp = pd.DataFrame({
    'year': years,
    'Mean Temp': yearly_mean.reindex([str(year) for year in years]).values,
})
nycTemp = nycTemp[['year', 'Mean Temp']]

fig, ax = plt.subplots()
ax.plot(nycTemp['year'], nycTemp['Mean Temp'])
ax.set_title('Yearly Average Temperature vs. Year')
ax.set_xlabel('Year')
ax.set_ylabel('Mean temperature (C)')
plt.show()

## Average summer temperature variation from 1949 to 2015

In [None]:
# Mean summer temperature per calendar year.
# The year of a record is the first four characters of its 'Date' string;
# the first year is skipped, as its data is only partially available.
year_labels = nycISI['Date'].str[:4]
StartYear = int(year_labels.iloc[0]) + 1
EndYear = int(year_labels.iloc[-1])
years = list(range(StartYear, EndYear + 1))

# Keep only the records labelled 'Summer', then average them per year with a
# single vectorised group-by instead of scanning every date once per year.
is_summer = nycISI['season'] == 'Summer'
summer_mean = nycISI.loc[is_summer, 'Mean.TemperatureC'].groupby(year_labels[is_summer]).mean()

# Build the result frame in one go: the column is float from the start and no
# chained assignment (nycTemp[...][i] = value) is needed, which pandas may apply
# to a temporary copy instead of the frame itself.
# Years without any summer record come out as NaN through reindex.
nycTemp = pd.DataFrame({
    'year': years,
    'Mean Temp': summer_mean.reindex([str(year) for year in years]).values,
})
nycTemp = nycTemp[['year', 'Mean Temp']]

fig, ax = plt.subplots()
ax.plot(nycTemp['year'], nycTemp['Mean Temp'])
ax.set_title('Summer Average Temperature vs. Year')
ax.set_xlabel('Year')
ax.set_ylabel('Mean temperature (C)')
plt.show()

## 1.1 What are the maximums/minimums and when did they happen? 
### Max wind speed ever recorded

In [None]:
# Highest value of the maximum wind speed column and the date of its first occurrence.
wind_speed_kph = nycISI['Max.Wind.SpeedKPH']
max_windSpeed = wind_speed_kph.max()
date_maxWindSpeed = nycISI.loc[wind_speed_kph == max_windSpeed, 'Date'].iloc[0]
earliest_date = nycISI['Date'].min()
print('Maximum wind speed ever recorded in NYC since {} is {} KPH which occured on {}.'.format(
    earliest_date, round(max_windSpeed, 3), date_maxWindSpeed))

### Max temperature ever recorded

In [None]:
# Highest value of the maximum temperature column and the date of its first occurrence.
max_temperature_c = nycISI['Max.TemperatureC']
max_temp = max_temperature_c.max()
date_maxTemp = nycISI.loc[max_temperature_c == max_temp, 'Date'].iloc[0]
record_start = nycISI['Date'].iloc[0]
print('Maximum temperature ever recorded in NYC since {} is {} C which occured on {}.'.format(
    record_start, round(max_temp, 3), date_maxTemp))

#### The maximum temperature ever recorded is very recent (after 2011). Is this a sign of climate change?

### Min temperature ever recorded

In [None]:
# Lowest value of the minimum temperature column and the date of its first occurrence.
min_temperature_c = nycISI['Min.TemperatureC']
min_temp = min_temperature_c.min()
date_minTemp = nycISI.loc[min_temperature_c == min_temp, 'Date'].iloc[0]
record_start = nycISI['Date'].iloc[0]
print('Minimum temperature ever recorded in NYC since {} is {} C which occured on {}.'.format(
    record_start, round(min_temp, 3), date_minTemp))