In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



from subprocess import check_output
# Show the contents of the ../input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))



import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
import datetime
from pylab import rcParams


In [None]:
# Load the weather history CSV into the working frame `df` and preview
# its first three rows.
df = pd.read_csv('../input/szeged-weather/weatherHistory.csv')
df.head(3)

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Categorical variables: names of the object-dtype columns.
categorical = df.select_dtypes(include="object").columns
print(categorical)

In [None]:
# Quantitative variables: names of the int64 / float64 columns.
numeric_dtypes = ["int64", "float64"]
quantitative = df.select_dtypes(include=numeric_dtypes).columns
print(quantitative)

In [None]:
# Parse the timestamp strings into timezone-aware UTC datetimes.
# No explicit `format` is given: the former '%Y%m%d %H' pattern contains no
# separators, minutes or UTC offset, so it cannot match a full timestamp when
# pandas applies the format strictly.
# NOTE(review): assumes 'Formatted Date' holds complete timestamps with an
# offset (e.g. '2006-04-01 00:00:00.000 +0200') - confirm against the CSV.
# utc=True is kept so that the result is a single datetime64 column on which
# the `.dt` accessor works.
df['Date'] = pd.to_datetime(df['Formatted Date'], utc=True)

# Calendar components (in UTC) used later for grouping and the heatmaps.
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(df['Date'].dt, part)

In [None]:
# Re-inspect the frame: it now also contains Date, year, month, day and hour.
df.info()

In [None]:
# Summary statistics of the quantitative columns. `quantitative` was computed
# before year/month/day/hour were added, so those columns are not included.
df[quantitative].describe()

In [None]:
# One histogram per quantitative column, drawn on an 8x8 figure.
rcParams['figure.figsize'] = 8, 8
df[quantitative].hist()

In [None]:
def plotCorrelationMatrix(df, graphWidth):
    """Plot the correlation matrix of the numeric columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. If it carries a `dataframeName` attribute, that value is
        used in the plot title; otherwise a generic label is used.
    graphWidth : number
        Width and height of the (square) figure.

    Returns
    -------
    None
        The figure is shown with `plt.show()`. If fewer than two usable
        columns remain, a message is printed and nothing is plotted.
    """
    # A plain DataFrame has no `dataframeName`; do not fail on it.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    # Correlations are only defined for numeric data; text columns would make
    # `corr()` raise on pandas versions that no longer drop them silently.
    df = df.select_dtypes(include='number')
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    fig = plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above rather than assuming it is figure 1.
    corrMat = plt.matshow(corr, fignum=fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()
    

nRowsRead = None # specify 'None' if want to read whole file

# Fresh copy of the raw file, labelled for the plot title.
df1 = pd.read_csv('../input/szeged-weather/weatherHistory.csv', sep=',', nrows=nRowsRead)
df1.dataframeName = 'Szeged Weather Data.csv'
nRow, nCol = len(df1.index), len(df1.columns)
print(f'There are {nRow} rows and {nCol} columns')

plotCorrelationMatrix(df1, 16)

In [None]:
# Remove the 'Loud Cover' column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
df = df.drop(columns='Loud Cover', errors='ignore')

In [None]:
# Zero pressure readings are replaced with the median pressure.
# The median is taken over the non-zero readings only, so the placeholder
# zeros do not pull the replacement value down, and re-running the cell
# yields the same median.
pressure_median = df.loc[df['Pressure (millibars)'] != 0, 'Pressure (millibars)'].median()

def pressure(x):
    """Return `pressure_median` for a zero reading, otherwise `x` unchanged."""
    return pressure_median if x == 0 else x

# Map the single column instead of applying a function to every full row.
df["Pressure (millibars)"] = df["Pressure (millibars)"].map(pressure)

rcParams['figure.figsize'] = 5, 3
df['Pressure (millibars)'].hist()

In [None]:
# Horizontal bar chart of how often each 'Summary' value occurs, followed by
# the number of distinct 'Summary' values (shown as the cell output).
rcParams['figure.figsize'] = 8, 5
sns.countplot(y=df['Summary'])
len(df['Summary'].unique()) #How many different 'Summary' categories are there?

In [None]:
# Absolute counts per 'Summary' value, in a single column named "count".
summary_freq = pd.crosstab(index=df['Summary'], columns="count")
# Relative frequencies: each count divided by the column total.
summary_freq_rel = summary_freq.div(summary_freq.sum())
summary_freq_rel.sort_values(by='count', ascending=False)

In [None]:
#new categorical variable:
def cloud_categorizer(row):
    """Translate a weather summary into an ordinal cloud-cover code.

    The argument is converted to a lower-case string and searched for the
    keywords below; the first keyword found decides the result:

        "foggy" -> 5, "overcast" -> 4, "mostly cloudy" -> 3,
        "partly cloudy" -> 2, "clear" -> 1

    Anything that contains none of the keywords yields 0.
    """
    text = str(row).lower()
    # Ordered by priority: earlier entries win when several keywords match.
    keyword_codes = (
        ("foggy", 5),
        ("overcast", 4),
        ("mostly cloudy", 3),
        ("partly cloudy", 2),
        ("clear", 1),
    )
    for keyword, code in keyword_codes:
        if keyword in text:
            return code
    return 0

# Categorize every 'Summary' value. Mapping the single column gives the same
# result as a row-wise DataFrame.apply without building a Series per row.
df["cloud (summary)"] = df["Summary"].map(cloud_categorizer)

In [None]:
# Number of observations per cloud category.
rcParams['figure.figsize'] = 5, 5
# Pass the values as `x=`: a positional Series is taken as `data` by newer
# seaborn releases and only produced a deprecation warning in older ones.
sns.countplot(x=df['cloud (summary)'])


In [None]:
# Distribution of visibility within each cloud category.
sns.boxplot(x=df['cloud (summary)'], y=df['Visibility (km)']) 

In [None]:
def cloud_categorizer(row):
    """Translate a weather summary into an ordinal cloud-cover code (1-5).

    This redefines the `cloud_categorizer` from the earlier cell. The keyword
    rules are the same; the only difference is the fallback: a summary that
    contains none of the keywords is now coded 4 instead of 0.

    The argument is converted to a lower-case string; the first keyword found,
    in the order listed below, decides the result.
    """
    summary = str(row).lower()
    # Insertion order is the priority order.
    codes_by_keyword = {
        "foggy": 5,
        "overcast": 4,
        "mostly cloudy": 3,
        "partly cloudy": 2,
        "clear": 1,
    }
    matches = (code for keyword, code in codes_by_keyword.items() if keyword in summary)
    return next(matches, 4)

# Recompute the category with the redefined function. Mapping the single
# column gives the same result as a row-wise DataFrame.apply without
# building a Series per row.
df["cloud (summary)"] = df["Summary"].map(cloud_categorizer)

In [None]:
# The figure size has to be set before the plot is created to apply to it.
rcParams['figure.figsize'] = 16, 10
# `x=` is passed by keyword (a positional Series is taken as `data` by newer
# seaborn releases), and `order` pins the bars to categories 1..5 so the
# labels below always line up, even if a category has no observations.
ax = sns.countplot(x=df['cloud (summary)'], order=[1, 2, 3, 4, 5])
ax.set_xticklabels(('1=Clear', '2=Partly Cloudy', '3=Mostly Cloudy', '4=Overcast', '5=Foggy'))

In [None]:
# Count the distinct values in the 'Daily Summary' column.
len(df['Daily Summary'].unique()) #number of categories

In [None]:
# Absolute counts per 'Daily Summary' value, in a single column named "count".
daily_summary_freq = pd.crosstab(index=df['Daily Summary'], columns="count")
# Relative frequencies: each count divided by the column total.
daily_summary_freqrel = daily_summary_freq.div(daily_summary_freq.sum())
# Show the 10 most common categories.
daily_summary_freqrel.sort_values(by='count', ascending=False).head(10)

In [None]:
#Let's create a new variable called 'cloud (daily summary)' using the same function we created for 'cloud (summary)'

# Map the single column instead of applying a function to every full row.
df["cloud (daily summary)"] = df["Daily Summary"].map(cloud_categorizer)
rcParams['figure.figsize'] = 8, 5
# `x=` is passed by keyword (a positional Series is taken as `data` by newer
# seaborn releases), and `order` pins the bars to categories 1..5 so the
# labels below always line up, even if a category has no observations.
ax = sns.countplot(x=df['cloud (daily summary)'], order=[1, 2, 3, 4, 5])
ax.set_xticklabels(('1=Clear', '2=Partly Cloudy', '3=Mostly Cloudy', '4=Overcast', '5=Foggy'))

In [None]:
#Drawing a heatmap
def facet_heatmap(data, color, **kws):
    """Draw an hour-by-day heatmap for one facet.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single facet. Must contain 'hour' and 'day' columns; the
        column at position 3 holds the values to plot.
    color :
        Accepted so the function can be called with a `color` argument;
        not used here.
    **kws :
        Forwarded unchanged to `sns.heatmap` (for example `vmin` / `vmax`).
    """
    values = data.columns[3]
    # pivot() raises ValueError when an (hour, day) pair occurs more than
    # once; keep the first reading of each pair so duplicated rows in the
    # data do not abort the whole figure.
    unique_cells = data.drop_duplicates(subset=['hour', 'day'])
    grid = unique_cells.pivot(index='hour', columns='day', values=values)
    sns.heatmap(grid, cmap='coolwarm', **kws)

#Joining heatmaps of every month in a year
def weather_calendar(year, weather, data=None):
    """Plot one hour-by-day heatmap per month for a given year and variable.

    Parameters
    ----------
    year : int
        Year to plot; compared against the 'year' column.
    weather : str
        Name of the quantitative column to display.
    data : pandas.DataFrame, optional
        Frame containing 'year', 'month', 'day', 'hour' and `weather`
        columns. Defaults to the notebook-level `df`.

    Raises
    ------
    ValueError
        If the frame has no rows for `year`.
    """
    source = df if data is None else data
    # `weather` must stay the fourth selected column: facet_heatmap reads the
    # values to plot from column position 3.
    dfyear = source.loc[source['year'] == year, ['month', 'day', 'hour', weather]]
    if dfyear.empty:
        raise ValueError(f'No rows found for year {year}')
    # Shared colour limits so the monthly heatmaps are comparable.
    vmin = dfyear[weather].min()
    vmax = dfyear[weather].max()
    # NOTE(review): with no named context, seaborn appears to ignore
    # font_scale here - confirm before relying on it.
    with sns.plotting_context(font_scale=12):
        g = sns.FacetGrid(dfyear, col="month", col_wrap=3) #One heatmap per month
        g = g.map_dataframe(facet_heatmap, vmin=vmin, vmax=vmax)
        g.set_axis_labels('Day', 'Hour')
        # Leave room above the grid for the figure title.
        plt.subplots_adjust(top=0.9)
        g.fig.suptitle('%s Calendar. Year: %s.' % (weather, year), fontsize=18)

In [None]:
# Monthly hour-by-day heatmaps of temperature for 2006.
weather_calendar(2006,'Temperature (C)')

In [None]:
# Monthly hour-by-day heatmaps of wind speed for 2008.
weather_calendar(2008,'Wind Speed (km/h)')

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
def plotScatterMatrix(df, plotSize, textSize):
    """Plot a scatter matrix with KDE diagonals for the numeric columns of `df`.

    Columns containing NaN and columns with a single distinct value are
    dropped, and at most the first 10 remaining columns are plotted. Each
    panel above the diagonal is annotated with the pair's correlation
    coefficient.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    plotSize : number
        Width and height of the (square) figure.
    textSize : number
        Font size of the correlation annotations.

    Returns
    -------
    None
        The figure is shown with `plt.show()`. If no usable column remains,
        a message is printed and nothing is plotted.
    """
    df = df.select_dtypes(include=[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] == 0:
        print('No scatter matrix shown: no numeric, non-NaN, non-constant columns')
        return
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly instead of reaching it through pyplot's namespace.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
nRowsRead = None # specify 'None' if want to read whole file
# With nRowsRead set to None the complete weather history file is loaded.
df2 = pd.read_csv('../input/szeged-weather/weatherHistory.csv', sep=',', nrows=nRowsRead)
df2.dataframeName = 'Szeged Weather Data.csv'
nRow, nCol = len(df2.index), len(df2.columns)

plotScatterMatrix(df2, 18, 10)