# Importing Packages

In [None]:
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import plotly.graph_objects as go

# Loading Meteo data

In [None]:
# Local data source: the quarterly CSV exports are read from a folder named
# `meteo_data` that sits next to this notebook.
Q1, Q2, Q3, Q4 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'meteo_data/LC_2022Q1.csv',
        'meteo_data/LC_2022Q2.csv',
        'meteo_data/LC_2022Q3.csv',
        'meteo_data/LC_2022Q4.csv',
    )
]

In [None]:
# Alternative data source (disabled): the Google Drive loader below is wrapped in a
# triple-quoted string, so none of it is executed. Because the string is the only
# expression in this cell, the notebook echoes it as the cell output.
# NOTE(review): if this is re-enabled, `MeteoLink.split('/')[-1]` keeps the trailing
# `?usp=share_link` query string, so `MeteoId` would not be a bare folder id.
'''#If google drive
# Set up a project in the Google API Console and enable the Google Drive API.

# Install the PyDrive library using pip: `pip install PyDrive`

# Authenticate with your Google account.
gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # Follow the instructions to authenticate with your Google account

# Step 4: Use the PyDrive client to retrieve the folder or file ID of the data you want to access.
drive = GoogleDrive(gauth)

# specify folders links
MeteoLink = "https://drive.google.com/drive/folders/1KznKVQzrCpRLgXyXsI2Yz0o2xzqO2CZo?usp=share_link"

# getting folders id
MeteoId = MeteoLink.split('/')[-1]

# getting folder by id
meteo = drive.CreateFile({'id': MeteoId})
meteo.FetchMetadata()

# getting files in the folders
MeteoFileList = drive.ListFile({'q': "'%s' in parents and trashed=false" % MeteoId}).GetList()
print (MeteoFileList[0])'''

# Cleaning Data

In [None]:
# Missing-value audit for Q1: per-column fraction and per-column count of NaN entries.
Q1MissingValuesRatio = Q1.isna().mean()
Q1MissingValuesCount = Q1.isna().sum()
print(Q1MissingValuesRatio)


In [None]:
# Remove the columns that are not used in the rest of the analysis.
Q1Drop = Q1.drop(columns=[
    'LC_DWPTEMP', 'LC_n', 'LC_RAD', 'LC_WINDDIR', 'LC_RAD60',
    'LC_TEMP_QCL0', 'LC_TEMP_QCL1', 'LC_TEMP_QCL2',
])

# Names of the measurement columns that are scaled and imputed further down.
hum, temp = 'LC_HUMIDITY', 'LC_TEMP_QCL3'
rain, dailyRain = 'LC_RAININ', 'LC_DAILYRAIN'
wind = 'LC_WINDSPEED'

# Shared preprocessing objects: a standard scaler and a KNN imputer that fills
# each gap from the single nearest neighbour.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=1)

In [None]:
# Filter Q1 down to the stations located near the noise spots.
# The list of all station IDs is kept in `station_ids` rather than `id`, so the
# builtin `id()` is not shadowed for the rest of the notebook.
station_ids = Q1Drop['ID'].unique()
print(station_ids)
Ids = ['LC-102', 'LC-117', 'LC-112', 'LC-118']
# `.copy()` makes the filtered frame independent of `Q1Drop`; the following cells
# assign scaled/imputed columns into it, which on a plain slice can trigger
# pandas' SettingWithCopyWarning.
Q1DropIDfilter = Q1Drop.loc[Q1Drop['ID'].isin(Ids)].copy()
print(Q1DropIDfilter['ID'].unique())

In [None]:
# Standardise the five weather columns of the filtered Q1 frame; the scaler is
# (re)fitted on exactly these columns.
q1_scale_cols = [hum, temp, rain, dailyRain, wind]
Q1DropIDfilter[q1_scale_cols] = scaler.fit_transform(Q1DropIDfilter[q1_scale_cols])


In [None]:
# Fill the gaps in the five weather columns of the filtered Q1 frame with the KNN imputer.
q1_impute_cols = [hum, temp, rain, dailyRain, wind]
Q1DropIDfilter[q1_impute_cols] = imputer.fit_transform(Q1DropIDfilter[q1_impute_cols])


In [None]:
# Missing-value audit for Q2: per-column fraction and per-column count of NaN entries.
Q2MissingValuesRatio = Q2.isna().mean()
Q2MissingValuesCount = Q2.isna().sum()
print(Q2MissingValuesRatio)


In [None]:
# Missing-value audit for Q3: per-column fraction and per-column count of NaN entries.
Q3MissingValuesRatio = Q3.isna().mean()
Q3MissingValuesCount = Q3.isna().sum()
print(Q3MissingValuesRatio)


In [None]:
# Missing-value audit for Q4: per-column fraction and per-column count of NaN entries.
Q4MissingValuesRatio = Q4.isna().mean()
Q4MissingValuesCount = Q4.isna().sum()
print(Q4MissingValuesRatio)

In [None]:
# Stack the four quarters into a single frame for the whole year; the row index
# is renumbered from 0 instead of keeping each quarter's own index.
year = pd.concat((Q1, Q2, Q3, Q4), ignore_index=True)

# Per-column fraction and count of NaN entries over the full year.
yearMissingValuesRatio = year.isna().mean()
yearMissingValuesCount = year.isna().sum()

print(yearMissingValuesRatio)

In [None]:
# Remove from the yearly frame the columns that are not used in the rest of the analysis.
yearDrop = year.drop(columns=[
    'LC_DWPTEMP', 'LC_n', 'LC_RAD', 'LC_WINDDIR', 'LC_RAD60',
    'LC_TEMP_QCL0', 'LC_TEMP_QCL1', 'LC_TEMP_QCL2',
])

In [None]:
# Filter the yearly frame down to the stations located near the noise spots.
# The list of all station IDs is kept in `station_ids` rather than `id`, so the
# builtin `id()` is not shadowed for the rest of the notebook.
station_ids = yearDrop['ID'].unique()
print(station_ids)
Ids = ['LC-102', 'LC-117', 'LC-112', 'LC-118']
# `.copy()` makes the filtered frame independent of `yearDrop`; the following cells
# assign scaled/imputed columns into it, which on a plain slice can trigger
# pandas' SettingWithCopyWarning.
yearDropIDfilter = yearDrop.loc[yearDrop['ID'].isin(Ids)].copy()
print(yearDropIDfilter['ID'].unique())

# Estimating the missing values with KNN

In [None]:

# Standardise the five weather columns of the filtered yearly frame; the scaler is
# refitted here, so from now on it holds the statistics of this frame.
year_scale_cols = [hum, temp, rain, dailyRain, wind]
yearDropIDfilter[year_scale_cols] = scaler.fit_transform(yearDropIDfilter[year_scale_cols])


In [None]:
# Fill the gaps in the five weather columns of the filtered yearly frame with the KNN imputer.
year_impute_cols = [hum, temp, rain, dailyRain, wind]
yearDropIDfilter[year_impute_cols] = imputer.fit_transform(yearDropIDfilter[year_impute_cols])


In [None]:

# Convert the imputed data back to the original scale.
# The scaler was fitted on `yearDropIDfilter`, and that is the frame holding the
# scaled + imputed values, so it is the one that must be inverse-transformed.
# `yearDrop` itself was never scaled or imputed; inverse-transforming it (as this
# cell used to do) distorts the raw values and leaves the gaps unfilled.
# `yearDrop` is rebound to the restored copy so that the cells below, which save
# and analyse `yearDrop`, work on the filtered, imputed data in original units.
# Building the result from a copy keeps this cell safe to re-run.
restored_cols = [hum, temp, rain, dailyRain, wind]
yearDrop = yearDropIDfilter.copy()
yearDrop[restored_cols] = scaler.inverse_transform(yearDropIDfilter[restored_cols])

In [None]:
# Show the processed yearly frame and write it, row index included, to
# 'YearProcessed.csv' in the working directory.
# NOTE(review): the file is meant to hold humidity, temperature, wind, rain and
# daily rain for the four selected stations -- confirm `yearDrop` is the filtered,
# imputed frame at this point.
print(yearDrop)
yearDrop.to_csv(path_or_buf='YearProcessed.csv', index=True)

# Checking variance over time among the 4 stations

In [None]:
# Spread of each weather measurement for every distinct value of the `Date` column.
# Only the measurement columns are aggregated: calling .mean()/.var() on the whole
# group would also include the string `ID` column, which raises a TypeError on
# pandas >= 2.0 (non-numeric columns are no longer dropped silently).
variance_labels = {
    temp: 'Temperature Variance',
    hum: 'Humidity Variance',
    wind: 'Wind Variance',
    rain: 'Rain Variance',
    dailyRain: 'Daily rain Variance',
}
grouped_by_date = yearDrop.groupby('Date')[list(variance_labels)]

# Step 1: mean of every measurement for each date
mean_values = grouped_by_date.mean()

# Step 2: variance of every measurement for each date
variance_values = grouped_by_date.var()

# Step 3: plot the variances over time, one line per measurement
for column, label in variance_labels.items():
    plt.plot(variance_values.index, variance_values[column], label=label)

plt.xlabel('Date')
plt.ylabel('Variance')
plt.title('Variance of Weather Measurements')
plt.legend()
plt.show()

In [None]:
# Keep only the rows of the full-year frame that have no missing value in any
# column, then print the per-column NaN counts of the result as a check.
yearMissingValuesRemoved = year.dropna(axis=0, how='any')
print(yearMissingValuesRemoved.isna().sum())

In [None]:
# Scatter of the `LC_TEMP_QCL0` column against `Date` for the complete rows of the year.
plt.scatter(
    x=yearMissingValuesRemoved['Date'],
    y=yearMissingValuesRemoved['LC_TEMP_QCL0'],
    s=5,
)
plt.gca().set(xlabel='Date', ylabel='TEMP_QCL0')
plt.show()

In [None]:
# Scatter of the `LC_TEMP_QCL1` column against `Date` for the complete rows of the year.
plt.scatter(
    x=yearMissingValuesRemoved['Date'],
    y=yearMissingValuesRemoved['LC_TEMP_QCL1'],
    s=5,
)
plt.gca().set(xlabel='Date', ylabel='TEMP_QCL1')
plt.show()

In [None]:
# Scatter of the `LC_TEMP_QCL2` column against `Date` for the complete rows of the year.
plt.scatter(
    x=yearMissingValuesRemoved['Date'],
    y=yearMissingValuesRemoved['LC_TEMP_QCL2'],
    s=5,
)
plt.gca().set(xlabel='Date', ylabel='TEMP_QCL2')
plt.show()

In [None]:
# Scatter of the `LC_TEMP_QCL3` column against `Date` for the complete rows of the year.
plt.scatter(
    x=yearMissingValuesRemoved['Date'],
    y=yearMissingValuesRemoved['LC_TEMP_QCL3'],
    s=5,
)
plt.gca().set(xlabel='Date', ylabel='TEMP_QCL3')
plt.show()

In [None]:
# Filter the complete rows of the year down to the stations located near the noise spots.
# The list of all station IDs is kept in `station_ids` rather than `id`, so the
# builtin `id()` is not shadowed for the rest of the notebook.
station_ids = yearMissingValuesRemoved['ID'].unique()
print(station_ids)
Ids = ['LC-102', 'LC-117', 'LC-112', 'LC-118']
FilteredByLocation = yearMissingValuesRemoved.loc[yearMissingValuesRemoved['ID'].isin(Ids)]
print(FilteredByLocation['ID'].unique())

In [None]:
# Column holding the temperature series that is plotted.
temp = 'LC_TEMP_QCL3'

# Mean temperature for every (station ID, Date) pair, with the group keys turned
# back into ordinary columns.
station_date_means = FilteredByLocation.groupby(['ID', 'Date'])[temp].mean()
df_grouped = station_date_means.reset_index()

# Wide layout: one row per Date, one column per station ID, temperature as values.
df_pivot = df_grouped.pivot(index='Date', columns='ID', values=temp)

# Line plot with one line per station.
df_pivot.plot.line()
#plt.xlim(['2022-02-01','2022-03-01'])
plt.gca().set(xlabel='Date', ylabel='Temperature', title='Temperature by ID and Date')
plt.show()

In [None]:
# Mean temperature for every (station ID, Date) pair across all stations.
df_grouped1 = yearMissingValuesRemoved.groupby(['ID', 'Date'])[temp].mean().reset_index()

# Wide layout: one row per Date, one column per station ID, temperature as values.
df_pivot1 = df_grouped1.pivot(index='Date', columns='ID', values=temp)

# Plot one line per station. With every station on the plot the legend would be
# unreadable, so it is switched off via `legend=False`. The previous
# `plt.legend('None')` did not hide it: the string was taken as a sequence of
# labels, so the first four lines were labelled 'N', 'o', 'n', 'e'.
df_pivot1.plot(kind='line', legend=False)
#plt.xlim(['2022-02-01','2022-03-01'])
plt.xlabel('Date')
plt.ylabel('Temperature')
plt.title('Temperature by ID and Date')
plt.show()


In [None]:
# Interactive temperature-vs-date figure with one trace per station ID, built from
# the complete rows of the year.
fig = go.Figure()

# The loop variables are named `station_id` / `station_rows` so that the builtin
# `id()` is not shadowed once the loop has finished.
for station_id, station_rows in yearMissingValuesRemoved.groupby('ID'):
    fig.add_trace(go.Scatter(x=station_rows['Date'], y=station_rows[temp], name=station_id))

# Axis labels
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Temperature')

# Title; the legend is hidden because there is one trace per station.
fig.update_layout(title='Temperature vs. Date for all IDs', showlegend=False)

# Show the plot
fig.show()

In [None]:
# Complete-case version of each quarter: every row containing a NaN is dropped.
(
    Q1MissingValuesRemoved,
    Q2MissingValuesRemoved,
    Q3MissingValuesRemoved,
    Q4MissingValuesRemoved,
) = (quarter.dropna() for quarter in (Q1, Q2, Q3, Q4))

In [None]:
# Scatter of the `LC_HUMIDITY` column against `Date` for the complete rows of Q1.
plt.scatter(
    x=Q1MissingValuesRemoved['Date'],
    y=Q1MissingValuesRemoved['LC_HUMIDITY'],
    s=5,
)
plt.gca().set(xlabel='Date', ylabel='Humidity')
plt.show()

## Loading Metadata

In [None]:
# Read the metadata CSV that lives in the same `meteo_data` folder as the quarterly files.
Metadata = pd.read_csv(filepath_or_buffer='meteo_data/01_Metadata_v2.csv')

In [None]:
# Quick look at the metadata: its column labels, then the distinct values of `SVF`.
print(Metadata.columns, Metadata['SVF'].unique(), sep='\n')

# Loading Noise data