In [1]:
import pandas as pd
import numpy as np

In [2]:
weather= pd.read_csv('weather.csv')

In [3]:
weather['Station'].nunique()

2

In [4]:
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [5]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [6]:
# Checking for missing data

In [7]:
# List every column that contains at least one 'M' (the missing-data marker).
# Iterating over weather.columns yields the column *names*, so the test has to
# be made against the values stored in each column, not against the label.
for column in weather.columns:
    if (weather[column] == 'M').any():
        print(column)

In [8]:
# Kihoon suggested this method, which is definitely more useful:
# count the 'M' entries in each column and list the largest counts first.
weather.eq('M').sum().sort_values(ascending=False)

Water1         2944
Depart         1472
SnowFall       1472
Depth          1472
Tavg             11
Cool             11
Heat             11
SeaLevel          9
StnPressure       4
WetBulb           4
AvgSpeed          3
PrecipTotal       2
Date              0
Tmax              0
Tmin              0
Sunrise           0
DewPoint          0
ResultDir         0
Sunset            0
CodeSum           0
ResultSpeed       0
Station           0
dtype: int64

In [9]:
weather['Depth'].value_counts()

M    1472
0    1472
Name: Depth, dtype: int64

In [10]:
weather['SnowFall'].value_counts()

M      1472
0.0    1459
  T      12
0.1       1
Name: SnowFall, dtype: int64

In [11]:
# Given Water1 contains all missing values, and depth and snowfall also do not contain any useful information
# I'm going to go ahead and drop these columns 

In [12]:
# Remove the three columns judged uninformative (Depth, SnowFall, Water1).
weather = weather.drop(columns=['Depth', 'SnowFall', 'Water1'])

In [13]:
def date_separate(weather):
    """Return a copy of ``weather`` with ``Year``, ``Month`` and ``Day`` columns added.

    The new columns are taken from the ``Date`` column after parsing it with
    ``pd.DatetimeIndex``. The frame passed in is not modified.
    """
    result = weather.copy()
    parsed_dates = pd.DatetimeIndex(result['Date'])
    # Add the columns in a fixed order: Year, Month, Day.
    for column, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
        result[column] = getattr(parsed_dates, part)
    return result

In [14]:
weather['CodeSum'].isnull().sum()

0

In [15]:
weather['CodeSum'].head(5)

0         
1         
2       BR
3    BR HZ
4         
Name: CodeSum, dtype: object

In [16]:
# The output above shows blank CodeSum entries, yet weather['CodeSum'].isnull().sum() returns 0.
# The blanks are stored as a single space ' ', which pandas treats as an ordinary (non-null) string.

In [17]:
# Count, per column, the entries that are exactly a single space; largest first.
weather.eq(' ').sum().sort_values(ascending=False)

CodeSum        1609
AvgSpeed          0
Heat              0
Date              0
Tmax              0
Tmin              0
Tavg              0
Depart            0
DewPoint          0
WetBulb           0
Cool              0
ResultDir         0
Sunrise           0
Sunset            0
PrecipTotal       0
StnPressure       0
SeaLevel          0
ResultSpeed       0
Station           0
dtype: int64

In [18]:
weather['CodeSum'].value_counts()

# I am going to drop any type of CodeSum that doesn't appear >20 times. 

                         1609
RA                        296
RA BR                     238
BR                        110
TSRA RA BR                 92
BR HZ                      81
RA DZ BR                   65
TSRA RA                    43
HZ                         39
RA BR HZ                   38
TSRA                       34
RA DZ                      22
TSRA BR                    21
TS TSRA RA BR              19
RA HZ                      16
TS RA                      13
TSRA RA BR HZ              12
TS                         10
TS TSRA BR                 10
DZ BR                      10
DZ BR HZ                    9
TSRA BR HZ                  8
DZ                          8
TS RA BR                    8
RA DZ BR HZ                 7
TS TSRA RA                  7
TS TSRA                     7
TS TSRA BR HZ               7
TSRA HZ                     4
RA SN                       4
                         ... 
RA FG+ BR                   1
RA BR HZ VCTS               1
RA BR FU  

In [19]:
weather['Station'].value_counts()

1    1472
2    1472
Name: Station, dtype: int64

In [20]:
# First, discard every row whose CodeSum is just a single space
weather = weather.loc[weather['CodeSum'].ne(' ')]

In [21]:
# Next, keep only the rows whose CodeSum value occurs more than 20 times.
# `low` holds the occurrence count of every CodeSum value.
low = weather['CodeSum'].value_counts()
# Test the CodeSum column itself. Calling isin() on the whole frame produces a
# 2-D boolean table, which is not a valid row mask.
weather = weather[weather['CodeSum'].isin(low.index[low > 20])]

In [22]:
# Please let me know if you think this is not a smart idea, because maybe the largest number of trapped mosquitos 
# occur under special conditions that we should consider. I just figured that since those conditions are not as frequent
# we shouldn't really consider them..
weather['CodeSum'].value_counts()

RA            296
RA BR         238
BR            110
TSRA RA BR     92
BR HZ          81
RA DZ BR       65
TSRA RA        43
HZ             39
RA BR HZ       38
TSRA           34
RA DZ          22
TSRA BR        21
Name: CodeSum, dtype: int64

In [23]:
# We see that the most common weather conditions are Rain, Mist, Haze, Drizzle, and Thunderstorm and therefore 
# combinations of the sort as well 

In [24]:
weather=date_separate(weather)

In [25]:
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,Year,Month,Day
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0.00,29.38,30.09,13.0,4,13.4,2007,5,2
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,0.00,29.44,30.08,13.3,2,13.4,2007,5,2
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,HZ,0.00,29.46,30.12,12.9,6,13.2,2007,5,3
6,1,2007-05-04,66,49,58,4,41,50,7,0,...,RA,T,29.31,30.05,10.4,8,10.8,2007,5,4
12,1,2007-05-07,83,47,65,10,41,54,0,0,...,RA,T,29.38,30.12,8.6,18,10.5,2007,5,7


In [26]:
weather['Station'].nunique()

2

In [27]:
weather.describe()

Unnamed: 0,Station,Tmax,Tmin,DewPoint,ResultSpeed,ResultDir,Year,Month,Day
count,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0
mean,1.497683,74.669138,58.178869,55.660797,7.139203,17.998146,2010.484708,7.526413,15.748842
std,0.500226,11.666074,9.995847,9.83292,3.746379,9.902306,2.281143,1.765732,8.902634
min,1.0,41.0,29.0,29.0,0.3,1.0,2007.0,5.0,1.0
25%,1.0,67.0,50.0,49.0,4.3,8.0,2009.0,6.0,8.0
50%,1.0,76.0,59.0,57.0,6.7,20.0,2010.0,8.0,16.0
75%,2.0,84.0,66.0,63.0,9.4,25.0,2013.0,9.0,23.0
max,2.0,103.0,81.0,74.0,24.1,36.0,2014.0,10.0,31.0


In [28]:
# Keep only the rows whose Sunset is not the placeholder '-'.
# NOTE(review): judging by the counts displayed after this step, this removes
# every Station 2 row - confirm that losing that station is acceptable.
weather = weather.loc[weather['Sunset'] != '-']

In [29]:
weather['Sunset'].dtypes

dtype('O')

In [30]:
weather['Station'].nunique()

# After keeping only the rows whose Sunset is not '-', the number of Stations
# dropped to one.

1

In [31]:
# Dropping the rows with '-' in Sunset does not change the column's dtype, so
# it is still reported as object; cast it to integer explicitly.
weather['Sunset'] = weather['Sunset'].astype(int)

In [32]:
weather['Sunset'].head(2)

2    1850
6    1852
Name: Sunset, dtype: int64

In [33]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset           int64
CodeSum         object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
Year             int64
Month            int64
Day              int64
dtype: object

In [34]:
# Inspecting further on why this an object
weather['Tavg'].dtypes

dtype('O')

In [35]:
# Collect the positions of the Tavg entries that cannot be read as numbers.
# A dtype of object means the column stores generic Python objects (here,
# strings); no individual element has type `object`, so the elements must be
# tested for being numeric instead.
non_numeric = pd.to_numeric(weather['Tavg'], errors='coerce').isna()
objects = [index for index, flag in enumerate(non_numeric) if flag]

In [36]:
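# `objects` is an empty list (not None). A dtype of object only says that the column stores generic Python objects
# such as strings; it does not mean that any individual element has type `object`.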
objects

[]

In [37]:
# Cast Tavg to integer; astype(int) raises if any value is not a whole-number string.
weather['Tavg'] = weather['Tavg'].astype(int)

In [38]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg             int64
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset           int64
CodeSum         object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
Year             int64
Month            int64
Day              int64
dtype: object

In [39]:
# Cast AvgSpeed from object to float.
weather['AvgSpeed'] = weather['AvgSpeed'].astype(float)

In [40]:
# Cast Heat from object to integer.
weather['Heat'] = weather['Heat'].astype(int)

In [41]:
# Cast Cool from object to integer.
weather['Cool'] = weather['Cool'].astype(int)

In [42]:
weather.describe()

Unnamed: 0,Station,Tmax,Tmin,Tavg,DewPoint,Heat,Cool,Sunset,ResultSpeed,ResultDir,AvgSpeed,Year,Month,Day
count,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0,542.0
mean,1.0,74.422509,57.234317,66.073801,55.557196,3.854244,4.928044,1841.972325,7.135978,18.147601,8.947417,2010.494465,7.54059,15.51107
std,0.0,11.649236,10.007506,10.405196,9.749328,6.096495,5.748293,88.990815,3.773947,9.984849,3.400862,2.284515,1.761473,8.919379
min,1.0,42.0,29.0,36.0,31.0,0.0,0.0,1647.0,0.3,1.0,2.4,2007.0,5.0,1.0
25%,1.0,67.0,49.25,59.0,49.0,0.0,0.0,1751.0,4.3,8.0,6.5,2009.0,6.0,8.0
50%,1.0,76.0,58.0,67.5,57.0,0.0,2.5,1857.0,6.5,20.0,8.4,2010.0,8.0,15.0
75%,1.0,83.0,65.0,74.0,63.0,6.0,9.0,1920.0,9.4,25.0,11.075,2013.0,9.0,23.0
max,1.0,103.0,81.0,91.0,74.0,29.0,26.0,1931.0,24.1,36.0,26.3,2014.0,10.0,31.0


In [43]:
(weather[weather.columns] == 'M').sum().sort_values(ascending=False)

SeaLevel       4
WetBulb        1
Day            0
Cool           0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
Heat           0
Sunrise        0
Month          0
Sunset         0
CodeSum        0
PrecipTotal    0
StnPressure    0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
Year           0
Station        0
dtype: int64

In [44]:
# Turn the remaining 'M' markers into real missing values (NaN).
weather = weather.replace(to_replace='M', value=np.nan)

In [52]:
# Replace the 'T' entries in PrecipTotal with an empty string.
# The column label in .loc is required: without it the assignment overwrites
# every column of the matching rows with ''.
# na=False makes missing PrecipTotal entries count as "no match" so the mask
# stays strictly boolean.
is_trace = weather['PrecipTotal'].str.contains('T', na=False)
weather.loc[is_trace, 'PrecipTotal'] = ''

In [54]:
# Turn empty strings into real missing values (NaN).
weather = weather.replace(to_replace='', value=np.nan)

In [59]:
# Drop every row that still contains at least one NaN value.
weather = weather.dropna(axis=0, how='any')