In [11]:
#importing the necessary libraries
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [25]:
#creating a pandas DataFrame from a CSV file
data = pd.read_csv("data_weather.csv")
print("columns are:",data.columns)
print("Data:\n", data)


columns are: Index(['number', 'air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am', 'relative_humidity_9am',
       'relative_humidity_3pm'],
      dtype='object')
Data:
       number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
0          0        918.060000     74.822000              271.100000   
1          1        917.347688     71.403843              101.935179   
2          2        923.040000     60.638000               51.000000   
3          3        920.502751     70.138895              198.832133   
4          4        921.160000     44.294000              277.800000   
...      ...               ...           ...                     ...   
1090    1090        918.900000     63.104000              192.900000   
1091    1091        918.710000     49.568000              241.600000   
1092    1092        916.600000     71.09600

In [24]:
print("null data:\n",data[data.isnull().any(axis=1)])

null data:
       number  air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
16        16        917.890000           NaN              169.200000   
111      111        915.290000     58.820000              182.600000   
177      177        915.900000           NaN              183.300000   
262      262        923.596607     58.380598               47.737753   
277      277        920.480000     62.600000              194.400000   
334      334        916.230000     75.740000              149.100000   
358      358        917.440000     58.514000               55.100000   
361      361        920.444946     65.801845               49.823346   
381      381        918.480000     66.542000               90.900000   
409      409               NaN     67.853833               65.880616   
517      517        920.570000     53.600000              100.100000   
519      519        916.250000     55.670000              176.400000   
546      546               NaN     42.746000        

In [26]:
#Data cleaning Steps
del data['number']
#Let us drop null values using the pandas dropna function.
before_rows = data.shape[0]
print(before_rows)
data = data.dropna()
after_rows = data.shape[0]
print(after_rows)


1095
1064


In [27]:
#how many rows dropped due to cleaning?
print("Total rows dropped:",before_rows - after_rows)

Total rows dropped: 31


In [29]:
#convert to a classification task
#Binarize the relative_humidity_3pm to 0 or 1.
clean_data=data.copy()
clean_data["high_humidity_label"]=(clean_data["relative_humidity_3pm"]>24.99)*1
print(clean_data["high_humidity_label"])

0       1
1       0
2       0
3       0
4       1
       ..
1090    1
1091    1
1092    1
1093    1
1094    0
Name: high_humidity_label, Length: 1064, dtype: int32


In [30]:
#Target is stored in 'y'.
Y=clean_data[["high_humidity_label"]].copy()
clean_data["relative_humidity_3pm"].head()
print("Y Data:\n",Y.head())

Y Data:
    high_humidity_label
0                    1
1                    0
2                    0
3                    0
4                    1


In [31]:
#use 9am sensor singals as features to predict humidity at 3pm
morning_features =["air_pressure_9am","air_temp_9am","avg_wind_direction_9am","avg_wind_speed_9am",
                   "max_wind_direction_9am","max_wind_speed_9am","rain_accumulation_9am","rain_duration_9am"]
X = clean_data[morning_features].copy()
print("columns in X:",X.columns)
print("columns in Y:",Y.columns)

columns in X: Index(['air_pressure_9am', 'air_temp_9am', 'avg_wind_direction_9am',
       'avg_wind_speed_9am', 'max_wind_direction_9am', 'max_wind_speed_9am',
       'rain_accumulation_9am', 'rain_duration_9am'],
      dtype='object')
columns in Y: Index(['high_humidity_label'], dtype='object')


In [32]:
#perform test and train split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.33, random_state=324)
print("X_train is as under:")
print(X_train.head())
print("X_test is as under:")
print(X_test.head())
print("Y_train is as under:")
print(Y_train.head())
print("Y_test is as under:")
print(Y_test.head())

X_train is as under:
     air_pressure_9am  air_temp_9am  avg_wind_direction_9am  \
841        918.370000     72.932000              184.500000   
75         920.100000     53.492000              186.100000   
95         927.610000     54.896000               55.000000   
895        919.235153     65.951112              194.343333   
699        919.888128     68.687822              228.517730   

     avg_wind_speed_9am  max_wind_direction_9am  max_wind_speed_9am  \
841            2.013246              186.700000            2.773806   
75            13.444009              193.800000           15.367778   
95             4.988376               53.400000            7.202947   
895            2.942019              216.569792            3.658810   
699            3.960858              247.954028            5.185547   

     rain_accumulation_9am  rain_duration_9am  
841                    0.0                0.0  
75                     0.0                0.0  
95                     0.0   

In [33]:
print("let us describe y_train")
Y_train.describe()

let us describe y_train


Unnamed: 0,high_humidity_label
count,712.0
mean,0.494382
std,0.50032
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [34]:
#fit on train set
humidity_classifier=DecisionTreeClassifier(max_leaf_nodes=10,random_state=0)
humidity_classifier.fit(X_train,Y_train)
type(humidity_classifier)

sklearn.tree._classes.DecisionTreeClassifier

In [35]:
#predict on test set
predictions = humidity_classifier.predict(X_test)
print("sample predictions:\n",predictions[:10])
print("sample Y test(Actual data):\n",Y_test["high_humidity_label"][:10])


sample predictions:
 [0 0 1 1 1 1 0 0 0 1]
sample Y test(Actual data):
 456     0
845     0
693     1
259     1
723     1
224     1
300     1
442     0
585     1
1057    1
Name: high_humidity_label, dtype: int32


In [45]:
from sklearn.metrics import accuracy_score

# Measure Accuracy of the Classifier
print("Accuracy: \n", accuracy_score(Y_true=Y_test, Y_pred=predictions))


TypeError: missing a required argument: 'y_true'