In [None]:
import pandas as pd
import os

# Locations of the raw inputs and of the merged output, relative to the
# directory the notebook is run from.
raw_data_path = '../data/raw/'
processed_data_path = '../data/processed/'
train_csv_path = os.path.join(raw_data_path, 'train.csv')
store_csv_path = os.path.join(raw_data_path, 'store.csv')
merged_csv_path = os.path.join(processed_data_path, 'train_merged.csv')

# Make sure the output directory exists. `exist_ok=True` turns this into a
# no-op when the directory is already present, so no separate
# check-then-create step is needed.
os.makedirs(processed_data_path, exist_ok=True)

# Load both datasets, join them on `Store`, and write the result to disk.
# A missing input file is reported with a hint instead of a traceback.
try:
    train_df = pd.read_csv(train_csv_path, low_memory=False)
    store_df = pd.read_csv(store_csv_path)

    # Left join: every row of the training set is kept.
    # validate='many_to_one' makes pandas raise a MergeError if `Store` is not
    # unique in store_df; without it, a duplicated store row would silently
    # duplicate the matching training rows. That error is intentionally not
    # caught below, so bad input stops the notebook.
    merged_df = pd.merge(
        train_df,
        store_df,
        on='Store',
        how='left',
        validate='many_to_one',
    )

    # Persist the merged frame without the positional index.
    merged_df.to_csv(merged_csv_path, index=False)

    print(f'Successfully merged {train_csv_path} and {store_csv_path} into {merged_csv_path}')
    print('Shape of the merged dataframe:', merged_df.shape)
    print('First 5 rows of the merged dataframe:')
    print(merged_df.head())

except FileNotFoundError as e:
    print(f'Error: {e}')
    print('Please make sure the raw data files (`train.csv` and `store.csv`) exist in the `data/raw` directory.')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the merged dataset from disk and convert its 'Date' column to
# datetime values as part of the same expression.
merged_csv_path = '../data/processed/train_merged.csv'
df = pd.read_csv(merged_csv_path, low_memory=False).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

print('Data loaded successfully.')
# Trailing expression: the notebook renders the first rows of the frame.
df.head()

### 1. Satışların Dağılımı

In [None]:
plt.style.use('ggplot')

# Only rows with strictly positive sales go into the histogram.
positive_sales = df.loc[df['Sales'] > 0, 'Sales']

# Histogram with 50 bins and a KDE overlay, drawn on an explicit axes object.
sales_fig, sales_ax = plt.subplots(figsize=(12, 6))
sns.histplot(positive_sales, bins=50, kde=True, ax=sales_ax)
sales_ax.set_title('Distribution of Sales (Mağaza Açıkken)')
sales_ax.set_xlabel('Sales')
sales_ax.set_ylabel('Frequency')
plt.show()

### 2. Tatil Günlerinin Satışlara Etkisi

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))

# One box plot of Sales per holiday indicator: (target axes, column, title).
holiday_panels = (
    (ax1, 'StateHoliday', 'Sales vs. State Holiday'),
    (ax2, 'SchoolHoliday', 'Sales vs. School Holiday'),
)
for panel_ax, holiday_col, panel_title in holiday_panels:
    sns.boxplot(data=df, x=holiday_col, y='Sales', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()

### 3. Promosyonların Satışlara Etkisi

In [None]:
# Box plot of Sales grouped by the Promo column, drawn on an explicit axes.
promo_fig, promo_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Promo', y='Sales', ax=promo_ax)
promo_ax.set_title('Sales vs. Promotion')
plt.show()

### 4. Kayıp Veri Analizi

In [None]:
# Percentage of missing values per column: the mean of the boolean null mask
# is the fraction of nulls, scaled here to percent.
missing_percentage = df.isna().mean() * 100
# Keep only columns that actually have missing values, largest share first.
missing_percentage = missing_percentage[missing_percentage > 0].sort_values(ascending=False)

if missing_percentage.empty:
    # With nothing missing the filtered Series is empty, and pandas raises
    # when asked to draw a bar chart of it, so report the result instead.
    print('No missing values found in any column.')
else:
    plt.figure(figsize=(12, 6))
    missing_percentage.plot(kind='bar')
    plt.title('Percentage of Missing Values by Column')
    plt.ylabel('Percentage (%)')
    plt.show()

    print(missing_percentage)