In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the cotton price CSV (the 'date' column is parsed day-first on load)
# and discard the columns this notebook does not keep, in a single chain.
file_path = 'COTTON.csv'
unused_columns = ['arrivalquantity', 'cropid', 'cropname']
data = (
    pd.read_csv(file_path, parse_dates=['date'], dayfirst=True)
    .drop(columns=unused_columns)
)

In [None]:
# Print a summary of the loaded frame (pandas `DataFrame.info`) to check
# what was read before any further processing.
data.info()

In [None]:
# Show the first rows of the frame (pandas `head` with its default row count)
# as a quick visual sanity check.
data.head()

In [None]:
# Ensure 'date' holds datetimes, interpreting text values as day/month/year
parsed_dates = pd.to_datetime(data['date'], format='%d/%m/%Y')
data = data.assign(date=parsed_dates)

In [None]:
# Derive calendar features from 'date': one new column per component,
# added in the order year, month, day.
date_parts = data['date'].dt
for part in ('year', 'month', 'day'):
    data[part] = getattr(date_parts, part)

In [None]:
# Create lag features.
# Order rows chronologically; a stable sort keeps the original relative order
# of rows that share the same date, so the result is deterministic.
data = data.sort_values(by='date', kind='mergesort')

# lag_k is the k-th previous 'modalprice' observation of the SAME mandi.
# Shifting the whole column without grouping would pair each row with the
# price of whichever other mandi happened to sit k rows above it.
# NOTE: lags count observations, not calendar days; gaps in the dates are not
# filled, so lag_1 is "previous recorded price", not necessarily "yesterday".
mandi_prices = data.groupby('mandiname')['modalprice']
lagged_columns = {f'lag_{lag}': mandi_prices.shift(lag) for lag in range(1, 8)}  # 7 previous observations

# The first 7 rows of every mandi get NaN lags (no history yet).
data = data.assign(**lagged_columns)

In [None]:
# Discard every row that has a missing value in any column; this includes the
# rows whose lag features are undefined because there is no earlier price.
data = data.dropna(axis='index', how='any')

In [None]:
# Replace each mandi name with an integer code. The fitted encoder is kept in
# `label_encoder` so codes can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(data['mandiname'])
data['mandiname'] = label_encoder.transform(data['mandiname'])

In [None]:
# Model inputs: the encoded mandi, the calendar parts of the date, and the
# seven lagged prices (lag_1 ... lag_7). The model output is the modal price.
lag_columns = [f'lag_{i}' for i in range(1, 8)]
features = ['mandiname', 'year', 'month', 'day', *lag_columns]
target = 'modalprice'

In [None]:
# Feature matrix (columns listed in `features`) and target series used for modelling
X, y = data[features], data[target]

In [None]:
# Split the data into training and testing sets chronologically.
# The rows were sorted by date when the lag features were built, so with
# shuffle=False the first 80% of rows (oldest) train the model and the final
# 20% (most recent) are held out. A random shuffled split would place rows
# from the same period on both sides; because every row carries the previous
# prices as lag features, the model would effectively see test-period prices
# during training and the reported error would be too optimistic for a
# forecasting task.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Fit a random forest regressor (100 trees, fixed seed) on the training split
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set; `y_pred` is compared against `y_test`
# in the evaluation step that follows.
y_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions with mean absolute error and report it
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

In [None]:
# Predict future prices for the next 10 days by recursive one-step-ahead
# forecasting: each prediction is fed back in as the newest lag for the
# following day.
#
# The forecast continues the series of the last row of `data` (the latest
# date, since rows were sorted by date) and therefore uses that row's mandi.
last_row = data.iloc[-1]
mandi_code = last_row['mandiname']
last_date = data['date'].max()

# Price history ordered newest first, so recent_prices[k - 1] is the value of
# lag_k for the next day to predict: the last observed price, followed by that
# row's own lag_1..lag_6.
recent_prices = [float(last_row[target])] + [float(last_row[f'lag_{k}']) for k in range(1, 7)]

future_predictions = []
for step in range(1, 11):
    next_date = last_date + pd.Timedelta(days=step)

    # Build the feature row by name rather than rotating a raw array:
    # rolling the whole vector would push mandiname/year/month/day into the
    # lag slots and write the new prediction to lag_7 instead of lag_1.
    feature_row = {
        'mandiname': mandi_code,
        'year': next_date.year,
        'month': next_date.month,
        'day': next_date.day,
    }
    feature_row.update({f'lag_{k}': recent_prices[k - 1] for k in range(1, 8)})

    # One-row DataFrame in the training column order, so the model receives
    # the same feature names it was fitted with.
    last_known = pd.DataFrame([feature_row], columns=features)

    next_pred = float(model.predict(last_known)[0])
    future_predictions.append(next_pred)

    # The prediction becomes lag_1 for the next step; the oldest price drops out.
    recent_prices = [next_pred] + recent_prices[:-1]

In [None]:
# Visualize the future predictions.
# One calendar day per prediction, starting the day after the last known date.
future_dates = pd.date_range(
    start=data['date'].max() + pd.Timedelta(days=1),
    periods=len(future_predictions),  # stays in step with the forecast length
    freq='D',
)
future_df = pd.DataFrame({'date': future_dates, 'predicted_modalprice': future_predictions})

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(future_df['date'], future_df['predicted_modalprice'], marker='o', linestyle='--', color='b')

# Add labels to each data point.
# The loop names deliberately avoid `x`/`y`: `y` holds the training target, and
# overwriting it here would break any later re-run of the train/test split cell.
for point_date, point_price in zip(future_df['date'], future_df['predicted_modalprice']):
    ax.text(point_date, point_price, f'{point_price:.2f}', ha='right', va='bottom', fontsize=14)

ax.set_title('Predicted COTTON Modal Price for the Next 10 Days')
ax.set_xlabel('Date')
ax.set_ylabel('Predicted Modal Price')
ax.tick_params(axis='x', labelrotation=0)
ax.grid(True)
fig.tight_layout()  # Adjust layout to prevent overlapping labels
plt.show()