In [22]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

In [23]:
data = pd.read_csv('temperature_humidity.csv')

print("Total number of samples: ", len(data))

print("Number of samples for each category: ")
print(data['humidity'].value_counts())

Total number of samples:  86400
Number of samples for each category: 
humidity
68.85    56
91.78    55
67.01    55
68.47    55
91.98    55
         ..
99.79     1
99.56     1
99.45     1
99.93     1
98.31     1
Name: count, Length: 3802, dtype: int64


In [24]:
data.drop(columns=['sensor_id', 'timestamp'], inplace=True)

print("Missing Values in each column: ")
missing_values = data.isnull().sum()
print(missing_values)

if missing_values.sum() > 0:
    missing_columns = missing_values[missing_values > 0].index.tolist()
    print("\n Columns with missing values: ", missing_columns)
    data = data.dropna()
    print("\n Dropped rows with missing values.")

Missing Values in each column: 
temperature      0
temperature_f    0
humidity         0
dtype: int64


In [25]:
# 4. Define features and target
X = data[['humidity']]
Y = data['temperature']
print(X)
print(Y)

       humidity
0         80.99
1         79.72
2         81.29
3         83.04
4         79.53
...         ...
86395     80.43
86396     78.90
86397     77.12
86398     79.42
86399     80.37

[86400 rows x 1 columns]
0        22.0
1        22.0
2        22.0
3        22.0
4        22.0
         ... 
86395    22.0
86396    22.0
86397    22.0
86398    22.0
86399    22.0
Name: temperature, Length: 86400, dtype: float64


In [26]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(X_scaled)

[[ 0.09764458]
 [-0.02802742]
 [ 0.12733088]
 ...
 [-0.28530866]
 [-0.05771371]
 [ 0.0362929 ]]


In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, Y, test_size=0.2, random_state=42, stratify=Y
)

print("Training set size: ", len(X_train))
print("Testing set size: ", len(X_test))
print(X_train)
print(y_train)
print(X_test)
print(y_test)

Training set size:  69120
Testing set size:  17280
[[ 1.48597376]
 [ 1.20197485]
 [ 0.91302822]
 ...
 [ 0.81110526]
 [ 0.01650203]
 [-1.0502256 ]]
71622    15.84
56077    16.36
52359    17.67
24639    28.83
1650     22.84
         ...  
52580    17.59
42583    22.31
52132    17.77
2623     23.33
14066    27.98
Name: temperature, Length: 69120, dtype: float64
[[-1.01954976]
 [-1.08189098]
 [-0.48816503]
 ...
 [ 1.26728471]
 [ 1.27223242]
 [-1.57765215]]
13001    27.68
28874    28.04
37824    24.67
60440    15.35
64454    15.00
         ...  
80102    18.90
55378    16.58
67054    15.09
69820    15.46
29281    27.94
Name: temperature, Length: 17280, dtype: float64


In [30]:
# Train a Random Forest Classifier
clf = LinearRegression()
print(clf)
clf.fit(X_train, y_train)

LinearRegression()


In [34]:
# Evaluate model
y_pred = clf.predict(X_test)
print(y_pred)
# print("classification Report: ")
# print(classification_report(y_test, y_pred, target_names=le.classes_))

[26.95147577 27.25413477 24.37166808 ... 15.8491749  15.82515434
 29.66099445]


In [46]:
new_data = np.array([[22.5]])
new_data_scaled = scaler.transform(new_data)
# new_data = np.array([[140.18, 118.15, 144.89, 81.52]])
# new_data_scaled = scaler.transform(new_data)
# # Predict using the trained classifier
y_pred = clf.predict(new_data_scaled)
# # Decode the prodicted label back to the original category
# predicted_category = le.inverse_transform(y_pred)
print(y_pred)



[49.62688039]


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from matplotlib import pyplot as plt

plt.scatter(X_test, y_test, color='blue', label='Actual Data')
plt.scatter(X_test, y_pred, color='blue', label='Actual Data')