In [None]:
import pandas as pd

# Load the dataset (replace 'your_data.csv' with your actual file path)
df = pd.read_csv('your_data.csv')

# Assuming the dataset has columns for date and temperature
# Convert the date column to datetime and use it as the index.
# Rebinding (instead of inplace=True) keeps the step explicit; sort_index() makes sure
# rows are in chronological order, which the forward-fill below and the positional
# train/test split in the modelling cell both rely on.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Select data for a single location (if multiple locations are available)
# df = df[df['location'] == 'specific_location']

# Select one year of data for simplicity
# .copy() gives an independent frame, so the column assignment below modifies `df`
# itself rather than a temporary slice of the full dataset.
df = df[df.index.year == 2020].copy()  # Replace 2020 with your year of choice

# Handle missing values (simple imputation or deletion, depending on the dataset)
# Forward-fill carries the last observed temperature into each gap. The result is assigned
# back to the column: the previous chained `fillna(method='ffill', inplace=True)` form
# acts on a temporary object (a no-op under pandas copy-on-write) and `method=` is
# deprecated. Missing values at the very start of the year stay NaN because there is
# no earlier observation to carry forward.
df['temperature'] = df['temperature'].ffill()


In [None]:
import matplotlib.pyplot as plt

# Line plot of the temperature column against the frame's index
ts_ax = df['temperature'].plot(title='Daily Temperature')
ts_ax.set_ylabel('Temperature')
plt.show()

# Histogram of the same column to see how the values are distributed
hist_ax = df['temperature'].hist()
hist_ax.set_title('Temperature Distribution')
hist_ax.set_xlabel('Temperature')
hist_ax.set_ylabel('Frequency')
plt.show()


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

# Hold-out split by row position: the first 80% of rows are used for fitting,
# the remaining rows are kept aside for evaluation
train_size = int(len(df) * 0.8)
train = df.iloc[:train_size]
test = df.iloc[train_size:]

# Build the ARIMA model on the training temperatures and fit it.
# order=(p, d, q) = (5, 1, 0): five autoregressive lags, first-order differencing,
# no moving-average terms -- adjust as needed
model = ARIMA(endog=train['temperature'], order=(5, 1, 0))
fitted_model = model.fit()

# Forecast one step for every held-out row
forecast = fitted_model.forecast(steps=len(test))


In [None]:
# Compare the forecast with actual values.
# `forecast` can already be a pandas Series carrying its own index (statsmodels returns
# one when the model was fitted on pandas data). Passing a Series together with `index=`
# makes pandas *reindex* it (align by label) rather than relabel it, so every label of
# test.index that is absent from the forecast's index becomes NaN -- the forecast line
# disappears from the plot and mean_squared_error fails on NaN input.
# np.asarray() strips the old index so the values are attached to test.index by position.
forecast_series = pd.Series(np.asarray(forecast), index=test.index)

# Overlay training data, held-out actuals and the forecast on one set of axes
plt.plot(train['temperature'], label='Train')
plt.plot(test['temperature'], label='Actual')
plt.plot(forecast_series, label='Forecast')
plt.title('Temperature Forecast vs Actuals')
plt.legend()
plt.show()

# Calculate Mean Squared Error between the held-out actuals and the forecast
mse = mean_squared_error(test['temperature'], forecast_series)
print(f'Mean Squared Error: {mse}')
