# Statistical Analysis and Machine Learning on NYC Weather Data

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

### Import the data from a comma-separated values (CSV) file

In [None]:
# Load the raw NYC weather observations (original units) once from disk.
nyc_weather = pd.read_csv('nyc_weather.csv')
# Independent deep copy that later cells rename and convert to metric units.
# Copying avoids parsing the same CSV a second time and keeps nyc_weather
# untouched as the source of the original values.
nycISI = nyc_weather.copy()
nyc_weather.head()  # preview the first rows

In [None]:
# Show the column names of nycISI (several of them are renamed and converted below).
nycISI.columns

### Conversion of the units to SI (metric) units in the data frame nycISI

In [None]:
# Convert the imperial-unit columns of nycISI to metric units.
# Each column is first renamed so its suffix reflects the new unit, then its
# values are recomputed from the untouched originals in nyc_weather.

# One international mile is exactly 1.609344 km (the previous 1.6 was a rough
# approximation that under-reported distances and speeds by about 0.6 %).
KM_PER_MILE = 1.609344

# original column name -> converted column name
fahrenheit_to_celsius_columns = {
    'Max.TemperatureF': 'Max.TemperatureC',
    'Mean.TemperatureF': 'Mean.TemperatureC',
    'Min.TemperatureF': 'Min.TemperatureC',
    'Max.Dew.PointF': 'Max.Dew.PointC',
    'MeanDew.PointF': 'Mean.Dew.PointC',
    'Min.DewpointF': 'Min.Dew.PointC',
}
miles_to_km_columns = {
    'Max.VisibilityMiles': 'Max.VisibilityKm',
    'Mean.VisibilityMiles': 'Mean.VisibilityKm',
    'Min.VisibilityMiles': 'Min.VisibilityKm',
    'Max.Wind.SpeedMPH': 'Max.Wind.SpeedKPH',
    'Mean.Wind.SpeedMPH': 'Mean.Wind.SpeedKPH',
}

# Temperatures and dew points: C = (F - 32) / 1.8
nycISI.rename(columns=fahrenheit_to_celsius_columns, inplace=True)
for imperial_name, metric_name in fahrenheit_to_celsius_columns.items():
    nycISI[metric_name] = (nyc_weather[imperial_name] - 32) / 1.8

# Visibility (miles -> km) and wind speed (mph -> km/h)
nycISI.rename(columns=miles_to_km_columns, inplace=True)
for imperial_name, metric_name in miles_to_km_columns.items():
    nycISI[metric_name] = nyc_weather[imperial_name] * KM_PER_MILE

# Part 1. Simple Statistical Analysis

## What is the distribution of temperature?

In [None]:
# Count how many entries of the daily mean temperature column are missing (NaN).
nycISI['Mean.TemperatureC'].isna().sum()

In [None]:
# Summary statistics (as produced by describe()) of the daily mean temperature column.
nycISI['Mean.TemperatureC'].describe()

In [None]:
# Report the overall mean of the daily mean temperature together with the
# first and last dates present in the data.
first_date = nycISI['Date'].iloc[0]
last_date = nycISI['Date'].iloc[-1]
overall_mean_temp = round(nycISI['Mean.TemperatureC'].mean(), 3)
print('The average temperature in NYC from {} to {} is {} C.'.format(first_date, last_date, overall_mean_temp))

In [None]:
# Histogram of the daily mean temperature.
# Missing values are dropped rather than replaced by the mean: imputing the
# mean would add an artificial spike to the bin containing the mean and so
# distort the very distribution this plot is meant to show. Dropping them
# still keeps NaN out of plt.hist.
plt.hist(nycISI['Mean.TemperatureC'].dropna())
plt.xlabel('Daily Mean Temperature \u00b0C')
plt.ylabel('Number of days')
plt.show()

### Average daily temperature variation in NYC 

In [None]:
# Line plot of every daily mean temperature value; the x-axis is the row index of nycISI.
daily_mean_temp_series = nycISI['Mean.TemperatureC']
plt.plot(daily_mean_temp_series)
plt.show()

## Average yearly temperature variation from 1949 to 2015

In [None]:
# Build nycTemp: one row per calendar year with the mean of the daily mean
# temperature for that year, then plot it.

# The year is read from the first four characters of the date string.
# The +1 skips the first year (1948), whose data is only partially available.
StartYear = int(nycISI['Date'].values[0][0:4]) + 1
EndYear = int(nycISI['Date'].values[-1][0:4])
years = list(range(StartYear, EndYear + 1))

# Group the daily values by the four-character year prefix in a single pass
# instead of re-scanning every date string once per year.
year_prefix = nycISI['Date'].str[:4]
yearly_mean = nycISI['Mean.TemperatureC'].groupby(year_prefix).mean()
# reindex yields exactly one value per requested year, NaN if a year has no rows.
mean_temp = yearly_mean.reindex([str(year) for year in years]).tolist()

nycTemp = pd.DataFrame({'year': years})
nycTemp['Mean Temp'] = mean_temp

plt.plot(nycTemp['year'], nycTemp['Mean Temp'])
plt.xlabel('Year')
plt.ylabel('Yearly Temperature \u00b0C')
plt.title('Yearly Average Temperature vs. Year')
plt.show()

## Global warming? 
### Calculate the 5-year temperature average

In [None]:
# w-year trailing average of the yearly mean temperature.
# For each year the window covers that year and the w - 1 years before it.
# The first w - 1 years have no complete trailing window, so they reuse the
# first complete window (the first w years).
# Fixes relative to the previous version: the early-year window was hard-coded
# to 4 years instead of w, later windows stopped one year short of the current
# year (so the final year never contributed), and a debug print was left in.
w = 5
yearly_mean_temp = nycTemp['Mean Temp']
kRange = range(0, len(nycTemp['year']))
mean_w_years_temp = []
for k in kRange:
    end_index = max(k + 1, w)      # exclusive end; never shorter than one full window
    start_index = end_index - w
    # .iloc makes the slice explicitly positional
    mean_w_years_temp.append(yearly_mean_temp.iloc[start_index:end_index].mean())
nycTemp['5Years Mean Temperature'] = mean_w_years_temp

plt.plot(nycTemp['year'], nycTemp['Mean Temp'], label = 'yearly mean')
plt.plot(nycTemp['year'], nycTemp['5Years Mean Temperature'], label = '5 years mean')
plt.xlabel('Year')
plt.ylabel('Temperature \u00b0C')
plt.legend()
plt.title('Average Temperature vs. Year')
plt.show()

In [None]:
# Report the difference between the last and the first entry of the
# 5-year mean temperature column, rounded to two decimals.
five_year_mean = nycTemp['5Years Mean Temperature']
first_year = nycTemp['year'].iloc[0]
last_year = nycTemp['year'].iloc[-1]
five_year_change = round(five_year_mean.iloc[-1] - five_year_mean.iloc[0], 2)
print('From {} to {} in NYC the 5-year average temperature has increased by {}\u00b0c! Is this a sign of global warming?'.format(first_year, last_year, five_year_change))

In [None]:
# Find the (first) year whose 5-year mean temperature equals the column maximum and report it.
five_year_mean_column = nycTemp['5Years Mean Temperature']
is_peak_row = five_year_mean_column == five_year_mean_column.max()
year_max = nycTemp.loc[is_peak_row, 'year'].values[0]
range_start_year = nycTemp['year'].iloc[0]
range_end_year = nycTemp['year'].iloc[-1]
print('The highest 5-year temperature average from {} to {} in NYC has occured in {}. Is this a sign of global warming?'.format(range_start_year, range_end_year, year_max))

###  What are the maximums/minimums and when did they happen? 
### Max wind speed ever recorded

In [None]:
# Largest value of the daily maximum wind speed column and the first date on which it appears.
daily_max_wind = nycISI['Max.Wind.SpeedKPH']
max_windSpeed = daily_max_wind.max()
date_maxWindSpeed = nycISI.loc[daily_max_wind == max_windSpeed, 'Date'].values[0]
earliest_date = nycISI['Date'].min()
print('Maximum wind speed ever recorded in NYC since {} is {} KPH which occured on {}.'.format(earliest_date, round(max_windSpeed, 3), date_maxWindSpeed))

### Max temperature ever recorded

In [None]:
# Largest value of the daily maximum temperature column and the first date on which it appears.
daily_max_temp = nycISI['Max.TemperatureC']
max_temp = daily_max_temp.max()
date_maxTemp = nycISI.loc[daily_max_temp == max_temp, 'Date'].values[0]
record_start_date = nycISI['Date'].iloc[0]
print('Maximum temperature ever recorded in NYC since {} is {} C which occured on {}.'.format(record_start_date, round(max_temp, 3), date_maxTemp))

#### The maximum temperature ever recorded is very recent (after 2011). Is this a sign of climate change?

### Min temperature ever recorded

In [None]:
# Smallest value of the daily minimum temperature column and the first date on which it appears.
daily_min_temp = nycISI['Min.TemperatureC']
min_temp = daily_min_temp.min()
date_minTemp = nycISI.loc[daily_min_temp == min_temp, 'Date'].values[0]
record_first_date = nycISI['Date'].iloc[0]
print('Minimum temperature ever recorded in NYC since {} is {} C which occured on {}.'.format(record_first_date, round(min_temp, 3), date_minTemp))