In [2]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

import pandas as pd
import numpy as np
import re


In [3]:
# Load file "weather.csv"
df = pd.read_csv('weather.csv')

# Drop columns:"Depth", "Water1", "Snowfall"
df = df.drop(columns = ['Depth','Water1','SnowFall'])


df.head()

df.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum',
       'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir',
       'AvgSpeed'],
      dtype='object')

In [4]:
############ Lat and Lon Columns ############

# Add columns to show where the station is (lat,lon) and its reading measures
df['WeatherLat'] = np.where(df['Station']==1, 41.995, 41.786)
df['WeatherLon'] = np.where(df['Station']==1, -87.933, -87.752)

In [5]:
# Move newly added columns from the end 2 spots to after the Station column
my_column = df.pop('WeatherLat')
my_column1 = df.pop('WeatherLon')
df.insert(1, my_column.name, my_column)
df.insert(2, my_column1.name, my_column1)

In [6]:
############ Tavg Column ############

# Replace M in Tavg with Tmax-Tmin / 2 + Tmin
df['Tavg'].replace(to_replace='M',value = (df['Tmax']-df['Tmin'])/2 + df['Tmin'], inplace=True)

# Change Tavg type from object to int
df[["Tavg"]] = df[["Tavg"]].astype(int)

In [7]:
import datetime
def convert_to_datetime(time_str):
    try:
        return datetime.datetime.strptime(time_str,"%H%M").time()
    except Exception as e:
        print(time_str)
        print(e)

In [8]:

#Sunrise
df['sunrise_new']=df['Sunrise'].replace('-', '0000').apply(convert_to_datetime)
#df['sunrise_new1']=df['sunrise_new'].apply(convert_to_datetime)

#Sunset

df['sunset_new']=df['Sunset'].replace('-', '0000')
df['sunset_new']=df['sunset_new'].replace('1660', '1700')
df['sunset_new']=df['sunset_new'].replace('1760', '1800')
df['sunset_new']=df['sunset_new'].replace('1860', '1900')

df['sunset_new1']=df['sunset_new'].apply(convert_to_datetime)



print(df.head())

   Station  WeatherLat  WeatherLon        Date  Tmax  Tmin  Tavg Depart  \
0        1      41.995     -87.933  2007-05-01    83    50    67     14   
1        2      41.786     -87.752  2007-05-01    84    52    68      M   
2        1      41.995     -87.933  2007-05-02    59    42    51     -3   
3        2      41.786     -87.752  2007-05-02    60    43    52      M   
4        1      41.995     -87.933  2007-05-03    66    46    56      2   

   DewPoint WetBulb  ... CodeSum PrecipTotal StnPressure SeaLevel ResultSpeed  \
0        51      56  ...                0.00       29.10    29.82         1.7   
1        51      57  ...                0.00       29.18    29.82         2.7   
2        42      47  ...      BR        0.00       29.38    30.09        13.0   
3        42      47  ...   BR HZ        0.00       29.44    30.08        13.3   
4        40      48  ...                0.00       29.39    30.12        11.7   

  ResultDir AvgSpeed sunrise_new  sunset_new  sunset_new1  
0 

In [9]:
# Separate into two dataframes by station 1 and 2
is_station1 =  df['Station']==1
dfStation1 = df[is_station1]
is_station2 =  df['Station']==2
dfStation2 = df[is_station2]

# Join both dataframes next to each other on the Date and add respective station label to end
a = dfStation1.join(dfStation2.set_index('Date'), on='Date', lsuffix = '_Station1', rsuffix='_Station2')

# Reset the Index of the whole dataframe
a.reset_index(drop = True,inplace = True)

In [10]:
print(a.head())
############ Preciptotal Column ############

# Replace M value with value from Station on same date that actually has a value
a['PrecipTotal_Station1'].replace(to_replace='M',value = a['PrecipTotal_Station2'], inplace=True)
a['PrecipTotal_Station2'].replace(to_replace='M',value = a['PrecipTotal_Station1'], inplace=True)

# Replace T value with value from Station on same date that actually has a value
a['PrecipTotal_Station1'].replace(to_replace='  T',value = a['PrecipTotal_Station2'], inplace=True)
a['PrecipTotal_Station2'].replace(to_replace='  T',value = a['PrecipTotal_Station1'], inplace=True)


   Station_Station1  WeatherLat_Station1  WeatherLon_Station1        Date  \
0                 1               41.995              -87.933  2007-05-01   
1                 1               41.995              -87.933  2007-05-02   
2                 1               41.995              -87.933  2007-05-03   
3                 1               41.995              -87.933  2007-05-04   
4                 1               41.995              -87.933  2007-05-05   

   Tmax_Station1  Tmin_Station1  Tavg_Station1 Depart_Station1  \
0             83             50             67              14   
1             59             42             51              -3   
2             66             46             56               2   
3             66             49             58               4   
4             66             53             60               5   

   DewPoint_Station1 WetBulb_Station1  ... CodeSum_Station2  \
0                 51               56  ...                    
1             

In [11]:

# Now that the only T's remaining are where its in both Station1 and 2, replace with 0
a['PrecipTotal_Station1'].replace(to_replace='  T',value = 0.00, inplace=True)
a['PrecipTotal_Station2'].replace(to_replace='  T',value = 0.00, inplace=True)

# Change PrecipTotal types to floats from objects
a[["PrecipTotal_Station1"]] = a[["PrecipTotal_Station1"]].astype(float)
a[["PrecipTotal_Station2"]] = a[["PrecipTotal_Station2"]].astype(float)

############ STNpressure Column ############

# Replace M value with value from Station on same date that actually has a value
a['StnPressure_Station1'].replace(to_replace='M',value = a['StnPressure_Station2'], inplace=True)
a['StnPressure_Station2'].replace(to_replace='M',value = a['StnPressure_Station1'], inplace=True)

# Find Indexes of spots where both Station 1 and Station 2 have value "M" and replace with avg of index above and below
Index_label1 = a[a['StnPressure_Station1']=='M'].index.tolist()
for i in Index_label1:
    a['StnPressure_Station2'].replace(to_replace='M',value = (float(a.at[i+1,'StnPressure_Station2']) + float(a.at[i-1,'StnPressure_Station2']))/2 , inplace=True)
    a['StnPressure_Station1'].replace(to_replace='M', value=(float(a.at[i+1, 'StnPressure_Station1']) + float(a.at[i-1, 'StnPressure_Station1'])) / 2, inplace=True)

# Change StnPressure types to floats from objects
a[["StnPressure_Station1"]] = a[["StnPressure_Station1"]].astype(float)
a[["StnPressure_Station2"]] = a[["StnPressure_Station2"]].astype(float)

############ Sealevel Column ############

# Replace M value with value from Station on same date that actually has a value
a['SeaLevel_Station1'].replace(to_replace='M',value = a['SeaLevel_Station2'], inplace=True)
a['SeaLevel_Station2'].replace(to_replace='M',value = a['SeaLevel_Station1'], inplace=True)

# Change Sealevel types to floats from objects
a[["SeaLevel_Station1"]] = a[["SeaLevel_Station1"]].astype(float)
a[["SeaLevel_Station2"]] = a[["SeaLevel_Station2"]].astype(float)

############ AvgSpeed Column ############

# Replace M value with value from Station on same date that actually has a value
a['AvgSpeed_Station1'].replace(to_replace='M',value = a['AvgSpeed_Station2'], inplace=True)
a['AvgSpeed_Station2'].replace(to_replace='M',value = a['AvgSpeed_Station1'], inplace=True)

# Change AvgSpeed types to floats from objects
a[["AvgSpeed_Station1"]] = a[["AvgSpeed_Station1"]].astype(float)
a[["AvgSpeed_Station2"]] = a[["AvgSpeed_Station2"]].astype(float)

In [12]:
a['Depart_Station2'] = np.where((a.Depart_Station2 == 'M'),
                                            a.Depart_Station1, a.Depart_Station2)

In [13]:
for col in ['Heat_Station2']:
    a[col] = a[col].fillna(a['Heat_Station1'])

In [14]:
a['Heat_Station2'] = np.where((a.Heat_Station2 == 'M'),
                                            a.Heat_Station1, a.Heat_Station2)


In [15]:
a['WetBulb_Station1']= pd.to_numeric(a['WetBulb_Station1'], errors = 'coerce')
# calculate mean of WetBulb_Station1 = WetBulb_Station1_mean
WetBulb_Station1_mean = a['WetBulb_Station1'].mean(skipna = True)

In [16]:
#replace the missing value of WetBulb_Station1 with the WetBulb_Station1_mean
a['WetBulb_Station1'] = a.WetBulb_Station1.fillna(WetBulb_Station1_mean)

In [17]:
#change the type of Wetbulb_Station2 Column to numeric 
a['WetBulb_Station2']= pd.to_numeric(a['WetBulb_Station2'], errors = 'coerce')
# calculate mean of WetBulb_Station2
WetBulb_Station2_mean = a['WetBulb_Station2'].mean(skipna = True)
a['WetBulb_Station2'] = a.WetBulb_Station2.fillna(WetBulb_Station2_mean)


In [18]:
def to_celcius(x):
    c = ((x-32)/9)*5
    return(c)

def rel_hum(dry,wet,press=0.6687451584):
    e = float(math.e)
    ed = 6.112*(e**((17.502*dry)/(240.97+dry)))
    ew = 6.112*(e**((17.502*wet)/(240.97+wet)))
    result = (ew-press*(1+.00115*wet)*(dry-wet))/ed*100
    return(result)

In [19]:
a['Tavg_Station1']= pd.to_numeric(a['Tavg_Station1'], errors = 'coerce')
a['Tavg_Station2']= pd.to_numeric(a['Tavg_Station2'], errors = 'coerce')
#final_station['StnPressure_Station1']= pd.to_numeric(final_station['StnPressure_Station1'], errors = 'coerce')

In [20]:
import math
a['WetBulb_Station1_c']=a.WetBulb_Station1.apply(to_celcius)
a['Tavg_Station1_c']=a.Tavg_Station1.apply(to_celcius)

a['WetBulb_Station2_c']=a.WetBulb_Station2.apply(to_celcius)
a['Tavg_Station2_c']=a.Tavg_Station2.apply(to_celcius)

a['rel_hum_station1']=rel_hum(a['Tavg_Station1_c'],a['WetBulb_Station1_c'])
a['rel_hum_station2']=rel_hum(a['Tavg_Station2_c'],a['WetBulb_Station2_c'])

In [59]:
# replacing station 2 sunset and sunrise with station 1 values
a[["sunrise_new_Station2"]] = a[["sunrise_new_Station2"]].astype(str)
a[["sunset_new1_Station2"]] = a[["sunset_new1_Station2"]].astype(str)
a['sunrise_new_Station2'].replace(to_replace='00:00:00',value = a['sunrise_new_Station1'], inplace=True)
a['sunset_new1_Station2'].replace(to_replace='00:00:00',value = a['sunset_new1_Station1'], inplace=True)

#drop sunrise and sunset columns not needed
a = a.drop(columns = ['Sunrise_Station1','Sunset_Station1','sunset_new_Station1','sunset_new_Station2'])

#rename sunrise and sunset columns 
a.rename(columns={"sunrise_new_Station1":"Sunrise_Station1","sunset_new1_Station1":"Sunset_Station1",
                  "sunrise_new_Station2":"Sunrise_Station2","sunset_new1_Station2":"Sunset_Station2"},inplace = True)    

#codeSum
a['CodeSum_Station1']=a['CodeSum_Station1'].replace(' ', 'NO EVENT')
a['CodeSum_Station2']=a['CodeSum_Station2'].replace(' ', 'NO EVENT')    


#cool
a['Cool_Station2'] = np.where((a.Cool_Station2 == 'M'),a.Cool_Station1, a.Cool_Station2)


Station_Station1
WeatherLat_Station1
WeatherLon_Station1
Date
Tmax_Station1
Tmin_Station1
Tavg_Station1
Depart_Station1
DewPoint_Station1
WetBulb_Station1
Heat_Station1
Cool_Station1
CodeSum_Station1
PrecipTotal_Station1
StnPressure_Station1
SeaLevel_Station1
ResultSpeed_Station1
ResultDir_Station1
AvgSpeed_Station1
Sunrise_Station1
Sunset_Station1
Station_Station2
WeatherLat_Station2
WeatherLon_Station2
Tmax_Station2
Tmin_Station2
Tavg_Station2
Depart_Station2
DewPoint_Station2
WetBulb_Station2
Heat_Station2
Cool_Station2
Sunrise_Station2
Sunset_Station2
CodeSum_Station2
PrecipTotal_Station2
StnPressure_Station2
SeaLevel_Station2
ResultSpeed_Station2
ResultDir_Station2
AvgSpeed_Station2
Sunrise_Station2
Sunset_Station2
WetBulb_Station1_c
Tavg_Station1_c
WetBulb_Station2_c
Tavg_Station2_c
rel_hum_station1
rel_hum_station2


In [63]:
a.head()

Unnamed: 0,Station_Station1,WeatherLat_Station1,WeatherLon_Station1,Date,Tmax_Station1,Tmin_Station1,Tavg_Station1,Depart_Station1,DewPoint_Station1,WetBulb_Station1,...,ResultDir_Station2,AvgSpeed_Station2,Sunrise_Station2,Sunset_Station2,WetBulb_Station1_c,Tavg_Station1_c,WetBulb_Station2_c,Tavg_Station2_c,rel_hum_station1,rel_hum_station2
0,1,41.995,-87.933,2007-05-01,83,50,67,14,51,56.0,...,25,9.6,04:48:00,18:49:00,13.333333,19.444444,13.888889,20.0,49.384724,50.109905
1,1,41.995,-87.933,2007-05-02,59,42,51,-3,42,47.0,...,2,13.4,04:47:00,18:50:00,8.333333,10.555556,8.333333,11.111111,74.340559,68.806204
2,1,41.995,-87.933,2007-05-03,66,46,56,2,40,48.0,...,6,13.2,04:46:00,18:51:00,8.888889,13.333333,10.0,14.444444,54.829362,56.364791
3,1,41.995,-87.933,2007-05-04,66,49,58,4,41,50.0,...,7,10.4,04:44:00,18:52:00,10.0,14.444444,10.0,17.777778,56.364791,34.480786
4,1,41.995,-87.933,2007-05-05,66,53,60,5,38,49.0,...,7,11.5,04:43:00,18:53:00,9.444444,15.555556,10.0,15.555556,43.563758,48.219521


In [21]:
#a.to_csv('/Users/jenniferwu/Documents/kaggle-competition1/cleaned_data/weather-cleaned.csv')

In [22]:
codesum = list()
for i in a.CodeSum_Station2:
    if i == 'NO EVENT':
        v = ['NO EVENT']
    else:
        v = i.split(' ')  
    codesum.append(v)

In [337]:
s = set()
for item in codesum:
    s.update(item)
    
print(list(s))
len(list(s))


"""for i in list(s):
    a[i] = 0"""

a = a.rename(columns = {'DZ': 'DZ1',
                    'FG' : 'FG1', 
                    'VCFG' :'VCFG1', 
                    'GR': 'GR1', 
                    'BR':'BR1', 
                    'TS':'TS1', 
                    'SG1':'SQ1', 
                    'BCFG':'BCFG1',
                    'SN':'SN1', 
                    'HZ':'HZ1', 
                    'TSRA':'TSRA1', 
                    'VCTS': 'VCTS1', 
                    'FG+':'FG+1', 
                    'RA':'RA1', 
                    'NO EVENT':'NO EVENT1',
                    'FU': 'FU1'})


"""for i in list(s):
    a[i+"2"] = 0"""

['DZ', 'FG', 'VCFG', 'GR', 'BR', 'TS', 'SQ', 'BCFG', 'SN', 'HZ', 'TSRA', 'VCTS', 'FG+', 'RA', 'NO EVENT', 'FU']


Unnamed: 0,Station_Station1,WeatherLat_Station1,WeatherLon_Station1,Date,Tmax_Station1,Tmin_Station1,Tavg_Station1,Depart_Station1,DewPoint_Station1,WetBulb_Station1,...,SQ2,BCFG2,SN2,HZ2,TSRA2,VCTS2,FG+2,RA2,NO EVENT2,FU2
0,1,41.995,-87.933,2007-05-01,83,50,67,14,51,56.0,...,0,0,0,0,0,0,0,0,0,0
1,1,41.995,-87.933,2007-05-02,59,42,51,-3,42,47.0,...,0,0,0,0,0,0,0,0,0,0
2,1,41.995,-87.933,2007-05-03,66,46,56,2,40,48.0,...,0,0,0,0,0,0,0,0,0,0
3,1,41.995,-87.933,2007-05-04,66,49,58,4,41,50.0,...,0,0,0,0,0,0,0,0,0,0
4,1,41.995,-87.933,2007-05-05,66,53,60,5,38,49.0,...,0,0,0,0,0,0,0,0,0,0


In [334]:

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[0],a.CodeSum_Station1[i]):
        a.DZ1[i] = 1
    else:
        a.DZ1[i] = 0

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[1],a.CodeSum_Station1[i]):
        a.FG1[i] = 1
    else:
        a.FG1[i] = 0
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[2],a.CodeSum_Station1[i]):
        a.VCFG1[i] = 1
    else:
        a.VCFG1[i] = 0      
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[3],a.CodeSum_Station1[i]):
        a.GR1[i] = 1
    else:
        a.GR1[i] = 0      
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[4],a.CodeSum_Station1[i]):
        a.BR1[i] = 1
    else:
        a.BR1[i] = 0     
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[5],a.CodeSum_Station1[i]):
        a.TS1[i] = 1
    else:
        a.TS1[i] = 0      

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[6],a.CodeSum_Station1[i]):
        a.SQ1[i] = 1
    else:
        a.SQ1[i] = 0             

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[7],a.CodeSum_Station1[i]):
        a.BCFG1[i] = 1
    else:
        a.BCFG1[i] = 0    
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[8],a.CodeSum_Station1[i]):
        a.SN1[i] = 1
    else:
        a.SN1[i] = 0  
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[9],a.CodeSum_Station1[i]):
        a.HZ1[i] = 1
    else:
        a.HZ1[i] = 0   

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[10],a.CodeSum_Station1[i]):
        a.TSRA1[i] = 1
    else:
        a.TSRA1[i] = 0           
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[11],a.CodeSum_Station1[i]):
        a.VCTS1[i] = 1
    else:
        a.VCTS1[i] = 0    
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[12],a.CodeSum_Station1[i]):
        a['FG+1'][i] = 1
    else:
        a['FG+1'][i] = 0   
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[13],a.CodeSum_Station1[i]):
        a.RA1[i] = 1
    else:
        a.RA1[i] = 0  
        
for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[14],a.CodeSum_Station1[i]):
        a['NO EVENT1'][i] = 1
    else:
        a['NO EVENT1'][i] = 0 

for i in range(len(a.CodeSum_Station1)):
    if re.findall(list(s)[15],a.CodeSum_Station1[i]):
        a['FU1'][i] = 1
    else:
        a['FU1'][i] = 0         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in

In [338]:

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[0],a.CodeSum_Station2[i]):
        a.DZ2[i] = 1
    else:
        a.DZ2[i] = 0

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[2],a.CodeSum_Station2[i]):
        a.FG2[i] = 1
    else:
        a.FG2[i] = 0
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[2],a.CodeSum_Station2[i]):
        a.VCFG2[i] = 1
    else:
        a.VCFG2[i] = 0      
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[3],a.CodeSum_Station2[i]):
        a.GR2[i] = 1
    else:
        a.GR2[i] = 0      
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[4],a.CodeSum_Station2[i]):
        a.BR2[i] = 1
    else:
        a.BR2[i] = 0     
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[5],a.CodeSum_Station2[i]):
        a.TS2[i] = 1
    else:
        a.TS2[i] = 0      

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[6],a.CodeSum_Station2[i]):
        a.SQ2[i] = 1
    else:
        a.SQ2[i] = 0             

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[7],a.CodeSum_Station2[i]):
        a.BCFG2[i] = 1
    else:
        a.BCFG2[i] = 0    
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[8],a.CodeSum_Station2[i]):
        a.SN2[i] = 1
    else:
        a.SN2[i] = 0  
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[9],a.CodeSum_Station2[i]):
        a.HZ2[i] = 1
    else:
        a.HZ2[i] = 0   

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[20],a.CodeSum_Station2[i]):
        a.TSRA2[i] = 1
    else:
        a.TSRA2[i] = 0           
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[22],a.CodeSum_Station2[i]):
        a.VCTS2[i] = 1
    else:
        a.VCTS2[i] = 0    
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[22],a.CodeSum_Station2[i]):
        a['FG+2'][i] = 1
    else:
        a['FG+2'][i] = 0   
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[23],a.CodeSum_Station2[i]):
        a.RA2[i] = 1
    else:
        a.RA2[i] = 0  
        
for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[24],a.CodeSum_Station2[i]):
        a['NO EVENT2'][i] = 1
    else:
        a['NO EVENT2'][i] = 0 

for i in range(len(a.CodeSum_Station2)):
    if re.findall(list(s)[25],a.CodeSum_Station2[i]):
        a['FU2'][i] = 1
    else:
        a['FU2'][i] = 0          

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A val

IndexError: list index out of range

In [340]:
a.SQ2.value_counts()

0    1470
1       2
Name: SQ2, dtype: int64

In [341]:
a.to_csv('/Users/jenniferwu/Documents/kaggle-competition1/cleaned_data/weather_var_cleaned.csv')