In [75]:
import pandas as pd
import numpy as np

# --- Load both splits and stack them into one frame -------------------------
train = pd.read_csv('train.csv')
# The test split gets an all-NaN TAVG column before stacking.
test = pd.read_csv('test.csv').assign(TAVG=np.nan)

# Train rows come first, then test rows, under a fresh 0..n-1 index.
combined = pd.concat([train, test], ignore_index=True)

# --- Calendar features ------------------------------------------------------
# DATE is parsed day-first; Day / Month / Year are derived from the parsed
# values.
parsed_dates = pd.to_datetime(combined['DATE'], dayfirst=True)
combined['DATE'] = parsed_dates
combined['Day'] = parsed_dates.dt.day
combined['Month'] = parsed_dates.dt.month
combined['Year'] = parsed_dates.dt.year

# Oldest rows first.
combined = combined.sort_values(by='DATE')

# --- Precipitation gaps -----------------------------------------------------
# Wherever PRCP_A or PRCP_C is missing, substitute 8% of that row's PRCP_B.
# The right-hand side is a Series, so pandas aligns it to the masked rows by
# index label. PRCP_B itself is never modified here, so one estimate serves
# both columns.
prcp_estimate = 0.08 * combined['PRCP_B']
for prcp_col in ('PRCP_A', 'PRCP_C'):
    gaps = combined[prcp_col].isna()
    combined.loc[gaps, prcp_col] = prcp_estimate

# Reorder rows by month number.
# NOTE(review): sort_values uses a non-stable algorithm by default, so the
# date ordering established above is not guaranteed to survive within a month.
combined = combined.sort_values(by='Month')


# For every temperature column <measure>_<suffix>:
#   1. record in <flag>_<suffix> whether the value was originally missing
#      (1 = missing, 0 = present);
#   2. replace each missing value with the mean of that column over all rows
#      sharing the same Month.
# Flag columns are created for A, B, C before any imputation of that measure
# so the new columns land in the order MAVG_A, MAVG_B, MAVG_C, MMAX_A, ...
for measure, flag in (('TAVG', 'MAVG'), ('TMAX', 'MMAX'), ('TMIN', 'MMIN')):
    for suffix in ('A', 'B', 'C'):
        source_col = measure + '_' + suffix
        combined[flag + '_' + suffix] = np.where(combined[source_col].isna(), 1, 0)

    for suffix in ('A', 'B', 'C'):
        source_col = measure + '_' + suffix
        # Within each Month group, NaNs are filled with that group's mean
        # (the mean itself ignores NaNs).
        combined[source_col] = (
            combined
            .groupby('Month')[source_col]
            .transform(lambda values: values.fillna(values.mean()))
        )



# --- Precipitation consistency rule -----------------------------------------
# A non-zero PRCP_x is reset to 0 on rows where the mean of TMAX_A, TMAX_B and
# TMAX_C is below 0.5. The TMAX columns are not modified by this step, so the
# row mean is computed once and shared by all three PRCP columns.
avg_tmax = combined[['TMAX_A', 'TMAX_B', 'TMAX_C']].mean(axis=1)
low_tmax = avg_tmax < 0.5
for prcp_col in ('PRCP_A', 'PRCP_B', 'PRCP_C'):
    mask_prcp = combined[prcp_col] != 0
    combined.loc[mask_prcp & low_tmax, prcp_col] = 0

# --- Snow-depth consistency rule --------------------------------------------
# A non-zero SNWD_x is reset to 0 on rows where the mean of TMIN_A, TMIN_B and
# TMIN_C is above 0.5. Same structure as the precipitation rule.
avg_tmin = combined[['TMIN_A', 'TMIN_B', 'TMIN_C']].mean(axis=1)
high_tmin = avg_tmin > 0.5
for snwd_col in ('SNWD_A', 'SNWD_B', 'SNWD_C'):
    mask_snwd = combined[snwd_col] != 0
    combined.loc[mask_snwd & high_tmin, snwd_col] = 0

# Persist the cleaned frame (without the index) for the next stage.
combined.to_csv('processed_combined_two.csv', index=False)


In [83]:
import pandas as pd
import numpy as np

# Read the frame written by the preprocessing step.
df = pd.read_csv('processed_combined_two.csv')

# Tunables for the moving-average outlier check.
OUTLIER_WINDOW = 3        # length of the centered rolling window, in rows
OUTLIER_THRESHOLD = 12    # largest tolerated |value - moving average|
INPUT_DATE_COLUMN = 'DATE'
OUTPUT_PATH = 'yamma_yamma.csv'

# Row labels in chronological order. The input file is written after a sort on
# Month, so consecutive rows of `df` are not consecutive days; a rolling mean
# over the raw row order would not be a moving average over time. The rolling
# mean is therefore computed in date order and aligned back by index label,
# which leaves the row order of `df` (and of the output file) untouched.
# assumes one row per calendar date -- TODO confirm against the raw data
chronological_order = (
    pd.to_datetime(df[INPUT_DATE_COLUMN]).sort_values(kind='stable').index
)


def fix_outliers(frame, column, fallback_columns, row_order=None,
                 window=OUTLIER_WINDOW, threshold=OUTLIER_THRESHOLD):
    """Return `frame[column]` with moving-average outliers replaced.

    A value is treated as an outlier when it differs from the centered
    rolling mean of its own column by more than `threshold`. Outliers are
    replaced by the average of the two `fallback_columns` on the same row;
    every other value is returned unchanged. Rows whose rolling mean is NaN
    (window edges, or a NaN inside the window) are never flagged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding `column` and both fallback columns. Not modified.
    column : str
        Column to check.
    fallback_columns : tuple of two str
        Columns averaged to build the replacement value.
    row_order : index-like, optional
        Index labels of `frame` in the order the rolling window should
        traverse them. When omitted, the frame's own row order is used.
    window : int
        Rolling window length.
    threshold : float
        Maximum tolerated absolute deviation from the rolling mean.

    Returns
    -------
    pandas.Series
        Corrected values, indexed like `frame`.
    """
    values = frame[column]
    ordered = values if row_order is None else values.loc[row_order]

    # Rolling mean follows `row_order`; reindex puts it back on the frame's
    # own index so every later operation is positionally consistent.
    moving_avg = (
        ordered.rolling(window=window, center=True).mean().reindex(frame.index)
    )

    # Comparisons against NaN are False, so rows without a rolling mean are
    # kept as they are.
    is_outlier = (values - moving_avg).abs() > threshold

    first, second = fallback_columns
    replacement = (frame[first] + frame[second]) / 2
    return values.where(~is_outlier, replacement)


# Columns are corrected one after another, so a later correction sees the
# already-corrected values of earlier columns when it builds its replacement.
CORRECTIONS = (
    ('TMAX_A', ('TMAX_B', 'TMAX_C')),
    ('TMAX_B', ('TMAX_A', 'TMAX_C')),
    ('TMAX_C', ('TMAX_A', 'TMAX_B')),
    ('TAVG_A', ('TAVG_B', 'TAVG_C')),
    ('TAVG_B', ('TAVG_A', 'TAVG_C')),
    ('TAVG_C', ('TAVG_A', 'TAVG_B')),
)

for target_column, fallbacks in CORRECTIONS:
    df[target_column] = fix_outliers(
        df, target_column, fallbacks, row_order=chronological_order
    )

# Save the corrected data
df.to_csv(OUTPUT_PATH, index=False)

# Report the file that was actually written.
print(
    "Outliers have been identified and corrected using the moving average "
    "method. The updated data has been saved to '{}'.".format(OUTPUT_PATH)
)

Outliers have been identified and corrected using the moving average method. The updated data has been saved to 'processed_combined_two_corrected.csv'.
