In [97]:
import pandas as pd

# Read in the csv file and select the columns we are interested in;
df = pd.read_csv("Knock_Airport_hourly_weather_data.csv", skiprows=23, low_memory=False, usecols=[0,2,4,10,12,14,17,20])

#However, the 'date' column is retuned as an 'object', therefore we need pandas to recognise it as a datetime.
#The 'date' is formated to datetime;
df['Datetime']= pd.to_datetime(df['date'])

df = df.iloc[3689:12449,1:]

#To rearrange the dataframe to have the datetime column first we call it as follows
df = df[['Datetime', 'rain', 'temp', 'msl', 'wdsp', 'wddir', 'sun', 'clamt']]



In [98]:
#So let's look at the data types we are working with.
df.dtypes

Datetime    datetime64[ns]
rain                object
temp               float64
msl                 object
wdsp                object
wddir               object
sun                float64
clamt               object
dtype: object

In [102]:
# To work out which columns are not numeric query the dtype as follows
df.dtypes.eq(object)

Datetime    False
rain         True
temp        False
msl          True
wdsp         True
wddir        True
sun         False
clamt        True
dtype: bool

In [103]:
#define the colums of 'objects'
cols = df.columns[df.dtypes.eq(object)]
cols

Index(['rain', 'msl', 'wdsp', 'wddir', 'clamt'], dtype='object')

In [104]:
# Now use the mask to filter colums and apply pd.to_numeric to convert the object type data to numeric
#extracted from https://stackoverflow.com/questions/25952790/convert-pandas-series-from-dtype-object-to-float-and-errors-to-nans/47942854
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=0)

In [106]:
df[cols].head()

Unnamed: 0,rain,msl,wdsp,wddir,clamt
3689,0.0,1031.2,7,350,1
3690,0.0,1031.4,9,30,1
3691,0.0,1031.7,7,20,1
3692,0.0,1031.7,8,20,1
3693,0.0,1031.7,7,10,1


In [111]:
#So now we should have some reference data to use as a guide in generating some random numbers!
Ref_data = df.describe()
Ref_data

Unnamed: 0,rain,temp,msl,wdsp,wddir,sun,clamt
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,0.143642,9.038664,1013.507717,9.651598,192.734018,0.136735,6.111644
std,0.493014,4.541594,13.229258,5.273074,85.787768,0.299735,2.304258
min,0.0,-3.8,968.3,0.0,0.0,0.0,0.0
25%,0.0,6.0,1005.6,6.0,130.0,0.0,5.0
50%,0.0,9.0,1015.2,9.0,190.0,0.0,7.0
75%,0.0,12.2,1023.4,13.0,260.0,0.0,8.0
max,8.0,23.7,1045.1,39.0,360.0,1.0,8.0


In [76]:
df.max()

Datetime    1997-12-31 23:00:00
rain                          8
temp                       23.7
msl                       999.9
wdsp                          9
wddir                        90
sun                           1
clamt                         8
dtype: object

In [78]:
df.min()

Datetime    1997-01-01 00:00:00
rain                          0
temp                       -3.8
msl                        1000
wdsp                          0
wddir                         0
sun                           0
clamt                         0
dtype: object

In [84]:
df.std()

temp    4.541594
sun     0.299735
dtype: float64

In [93]:
df.iloc[:,2].mean()

9.038664383561642

In [94]:
df['msl']

3689     1031.2
3690     1031.4
3691     1031.7
3692     1031.7
3693     1031.7
3694     1031.8
3695       1032
3696       1032
3697       1032
3698     1032.4
3699     1032.5
3700     1032.4
3701     1032.1
3702     1031.8
3703     1031.6
3704     1031.3
3705     1031.4
3706     1031.7
3707     1031.5
3708     1031.5
3709     1031.6
3710     1031.8
3711     1031.5
3712     1031.3
3713     1030.8
3714       1030
3715     1029.3
3716     1028.4
3717     1027.5
3718     1026.3
          ...  
12419       978
12420     978.7
12421     979.2
12422     981.4
12423     983.3
12424     984.5
12425       986
12426     987.2
12427     988.4
12428       990
12429     990.8
12430     991.2
12431     991.9
12432     993.1
12433     993.8
12434     994.5
12435     995.6
12436     995.9
12437     996.4
12438     996.6
12439     997.4
12440       998
12441     998.2
12442     998.5
12443     999.4
12444     999.7
12445    1000.1
12446    1000.6
12447      1001
12448    1000.4
Name: msl, Length: 8760,

In [95]:
pd.to_numeric('msl')

ValueError: Unable to parse string "msl" at position 0

In [None]:
#To tidy up the frame view we can set Datetime as the index as follows;
#df = df.set_index(['Datetime'])

In [None]:
#df.describe(exclude=['Datetime'])