# Exploratory Data Analysis

In this section of the notebook, I will be exploring the data and answering the following questions:

   1. Is there something interesting to count?
   2. Are there any trends (e.g. high, low, increasing, decreasing, anomalies)?
   3. Are there any valuable comparisons between two related quantities?
  
I use histograms, bar plots, scatterplots, and time-series plots to answer the following questions:

   4. Are there any insights from the data?
   5. Are there any correlations? 
   6. What is a hypothesis that can be taken further?
   7. What other questions arise from these insights and correlations?
   
After answering these questions, I provide a link to a presentation that uses text and plots to tell the compelling story of my data.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import statsmodels.api as sm
import visualization as vz
import warnings
from sklearn.model_selection import train_test_split
from textwrap import wrap
from itertools import combinations
from scipy import stats
from datetime import datetime
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_absolute_error
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

def _load_indexed_by_date(csv_path):
    """Read a CSV and return it indexed by its parsed ``Date`` column."""
    table = pd.read_csv(csv_path)
    table['Date'] = pd.to_datetime(table['Date'])
    return table.set_index('Date')


# Both tables go through the same read -> parse dates -> index-by-date steps.
file = '../data/manipulated_data/rainfalldata.csv'
file2 = '../data/manipulated_data/ncrainfalldata.csv'
rd = _load_indexed_by_date(file)
ncrd = _load_indexed_by_date(file2)

ModuleNotFoundError: No module named 'visualization'

In [None]:
# Preview the first rows of the Date-indexed table read from rainfalldata.csv.
rd.head()

In [None]:
# Preview the first rows of the Date-indexed table read from ncrainfalldata.csv.
ncrd.head()

In [None]:
# Monthly climatology for Raleigh: mean rainfall per calendar month, drawn as
# bars with the standard error of the mean as error bars.
month_numbers = list(range(1, 13))
raleigh_rain = rd['Raleigh, NC']

monthavg = []
monthsem = []
monthstd = []
for i in month_numbers:
    in_month = raleigh_rain[rd.index.month == i]
    monthavg.append(np.mean(in_month))
    monthsem.append(stats.sem(in_month))
    monthstd.append(np.std(in_month))
print(monthstd)

# Take the year span for the title from the dates that actually carry a
# Raleigh value instead of hard-coding it.
observed_years = raleigh_rain.dropna().index.year

fig, ax = plt.subplots()
# The three lists above are ordered January..December, so the bars must sit at
# 1..12 too. rd.index.month.unique() is ordered by first appearance in the
# data and only lines up with the lists when the series starts in January.
ax.bar(month_numbers, monthavg, yerr=monthsem, alpha=0.5, ecolor='black', capsize=10)
ax.set_xticks(month_numbers)
ax.set_title('Average Monthly Rainfall in Raleigh, NC from {} to {}'.format(
    observed_years.min(), observed_years.max()))
ax.set_xlabel('Month')
ax.set_ylabel('Rainfall (in)')
plt.tight_layout()
# Save before show(): the figure written to disk is the one displayed below.
plt.savefig('raleighmonthly.jpg')
plt.show()

In [None]:
# Average every column within each calendar year ('Y' = year-end bins); this is
# the table the yearly plots further down are drawn from.
rdyearavg = rd.resample('Y').mean()
# Preview the first few yearly rows.
rdyearavg.head()

In [None]:
def yearlyavgfigs(df, loc, **keyword_parameters):
    """Plot yearly average rainfall for one or more locations in one figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index exposes ``.year`` (a datetime-like index); the years
        are used as the x-axis. Each location is one column.
    loc : str or sequence of str
        Column name(s) to plot. A bare string is treated as a single location.
        Every listed column is drawn; with more than one, a legend is added so
        the lines can be told apart.
    **keyword_parameters
        Only ``color`` is consulted, and only when a single location is
        plotted; it is passed to ``plt.plot`` as the format argument. Any
        other keyword is ignored.

    Raises
    ------
    ValueError
        If ``loc`` names no columns.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(loc, str):
        loc = [loc]
    loc = list(loc)
    if not loc:
        raise ValueError('loc must name at least one column to plot')

    years = df.index.year

    # Title span comes from the rows that hold a value for at least one of the
    # plotted columns, rather than from a hard-coded pair of years.
    observed = df[loc].dropna(how='all').index.year
    if len(observed) == 0:
        observed = years
    span = ' from {} to {}'.format(observed.min(), observed.max())

    plt.figure(figsize=(400/96, 400/96), dpi=96)
    if len(loc) == 1:
        if 'color' in keyword_parameters:
            plt.plot(years, df[loc[0]], keyword_parameters['color'])
        else:
            plt.plot(years, df[loc[0]])
        plt.title('Average Yearly Rainfall in ' + loc[0] + span)
    else:
        for name in loc:
            plt.plot(years, df[name], label=name)
        plt.legend()
        # Multi-location titles get long; wrap them to fit the small figure.
        plt.title("\n".join(wrap('Average Yearly Rainfall in ' + '-'.join(loc) + span, 60)))
    plt.xlabel('Year')
    plt.ylabel('Rainfall (in)')
    plt.show()
# Three figures: Raleigh alone, Greensboro alone (drawn in orange), then both
# locations together in one figure for comparison.
yearlyavgfigs(rdyearavg, ['Raleigh, NC'])
yearlyavgfigs(rdyearavg, ['Greensboro AP, NC'], color='orange')
yearlyavgfigs(rdyearavg, ['Raleigh, NC', 'Greensboro AP, NC'])

# Seasonal Decomposition

In [3]:
# Global matplotlib defaults for the figures that follow: 50x50-inch canvas and
# a 32-pt base font.
# NOTE: rcParams changes persist for the rest of the kernel session, so
# re-running an earlier plotting cell that does not set its own figure size or
# font size will pick these values up.
plt.rcParams["figure.figsize"] = (50,50)
plt.rcParams["font.size"] = 32.0