In [30]:
import numpy as np
import pandas as pd
import os
import pickle

import datetime
import random

In [2]:
df = pd.read_csv("./data/diamonds.csv")

In [3]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.799444,61.753006,57.45783,3944.80544,5.734403,5.737956,3.541056
std,0.475173,1.431088,2.232092,3997.938105,1.123077,1.145579,0.707065
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2410.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5351.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [7]:
def generate_synthetic_data(data, num_entries=5000):

    synthetic_data = {}
    categorical_columns = ['cut', 'color', 'clarity']

    for column in df.columns:
        if column in categorical_columns:
            # For categorical columns, sample values based on the distribution in the original data
            category_distribution = df[column].value_counts(normalize=True)
            synthetic_data[column] = np.random.choice(category_distribution.index, num_entries, p=category_distribution.values)
        else:
            # For numerical columns, calculate statistics and generate synthetic values
            column_data = df[column]
            column_stats = {
                "mean": column_data.mean() * np.random.choice([0.85, 1.15]) if column == "carat" else column_data.mean(),
                "std": column_data.std(),
                "min": column_data.min(),
                "max": column_data.max()
            }
            
            synthetic_data[column] = np.random.normal(column_stats["mean"], column_stats["std"], num_entries)
            synthetic_data[column] = np.clip(synthetic_data[column], column_stats["min"], column_stats["max"])

    # Create a DataFrame from the synthetic data
    synthetic_df = pd.DataFrame(synthetic_data)

    return synthetic_df

    




    

In [8]:
items = ['march', 'april', 'may']

# Create a directory for storing the data
#data_directory = 'data'
#os.makedirs(data_directory, exist_ok=True)

# Loop through the items
for item in items:
    # Generate synthetic data for the current item
    df_item = generate_synthetic_data(df)
    
    # Save the DataFrame using pickle
    pickle_filename = os.path.join("./data", f'df_{item}.pkl')
    with open(pickle_filename, 'wb') as pickle_file:
        pickle.dump(df_item, pickle_file)
    
    print(f"Saved DataFrame for {item} to {pickle_filename}")

Saved DataFrame for march to ./data/df_march.pkl
Saved DataFrame for april to ./data/df_april.pkl
Saved DataFrame for may to ./data/df_may.pkl


In [9]:
with open('data/df_march.pkl' , 'rb') as pickle_file:
    df_march = pickle.load(pickle_file)

df_march.head(20)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.240666,Very Good,G,VS1,58.968848,59.262789,326.0,7.34373,7.027,3.312833
1,0.670108,Very Good,D,VS1,63.161015,55.405463,326.0,5.842526,6.095739,4.541541
2,0.780663,Premium,H,SI1,63.120326,54.104004,2493.715435,5.934459,5.284436,3.052986
3,1.140031,Ideal,I,VS2,62.148713,59.18147,4495.966274,5.029023,5.051308,2.564967
4,0.427246,Ideal,I,VVS2,61.053272,56.705145,4530.709494,5.338554,4.984665,3.640142
5,0.2,Premium,G,SI1,60.328204,57.234892,12535.73395,5.477401,6.218061,3.15513
6,0.422652,Fair,F,VS1,63.906303,57.680978,5464.406452,3.672872,6.105124,4.640326
7,0.969146,Ideal,G,SI2,60.933681,59.020939,326.0,6.369386,4.740226,2.578971
8,0.2,Ideal,H,VS2,60.2816,53.258181,7507.522573,5.790178,3.903756,3.789688
9,0.468132,Very Good,F,VS2,62.135648,57.010118,1946.439654,6.63164,5.514759,3.299592


In [10]:
df.carat.mean()

0.7994437999999999

In [11]:
df_march.carat.mean()

0.7127894562115562

In [12]:
with open('data/df_april.pkl' , 'rb') as pickle_file:
    df_april = pickle.load(pickle_file)

df_april.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.516645,Ideal,F,VS1,63.224835,55.414947,5337.446918,5.980846,5.396138,3.448404
1,0.572623,Very Good,E,SI1,59.902055,59.016616,1997.006595,6.426172,7.245021,4.187258
2,0.577685,Ideal,H,VS2,61.125985,55.271342,4438.271644,5.043707,6.630305,3.139742
3,1.130176,Very Good,H,SI1,61.322514,58.906464,9350.150961,6.776027,3.971966,3.405
4,0.662716,Good,D,VS2,62.590775,55.910938,5420.946504,4.981277,5.495147,2.616613


In [13]:
df_april.carat.mean()

0.7231862938534532

In [14]:
with open('data/df_may.pkl' , 'rb') as pickle_file:
    df_may = pickle.load(pickle_file)

df_may.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.375632,Premium,E,SI2,61.216611,58.520365,6647.947013,7.04765,3.953273,4.544616
1,1.547515,Ideal,F,VVS2,61.812195,52.894064,15592.453815,5.324176,5.77163,3.778284
2,0.582045,Ideal,I,VVS2,62.252235,59.188428,6637.133011,5.840766,4.448582,3.038797
3,1.4395,Ideal,G,VVS2,60.745982,61.262146,2827.93998,4.983051,3.591145,3.403229
4,0.2,Very Good,E,VVS1,60.278692,55.978424,3089.789291,6.609352,5.96646,3.566855


In [15]:
df_may.carat.mean()

0.7149263514974368

In [16]:
df.carat.mean()

0.7994437999999999

In [32]:
from datetime import datetime, timedelta

In [33]:
def add_date_column(df, month=1, year=2022):
    # Number of entries in your DataFrame
    existing_df = df.copy()
    num_entries = len(existing_df)

    # Define the date range
    start_date = datetime(year, month, 1)
    #end_date = start_date + timedelta(months=1)
    end_date = datetime(year, month , 31) + timedelta(days=1)
    
    # Calculate the interval between dates to cover the entire date range
    interval = (end_date - start_date) / (num_entries - 1)
    
    # Generate dates covering the entire range, including the end date
    fake_dates = [start_date + i * interval for i in range(num_entries)]
    
    # Adjust last date to be the end_date
    #fake_dates[-1] = end_date
    
    # Add randomness to date distribution
    random.shuffle(fake_dates)
    
    # Sort the dates
    fake_dates.sort()
    
    # Add the generated dates to the existing DataFrame
    existing_df['date'] = fake_dates

    return existing_df

In [34]:
df_may_2 = add_date_column(df_may, month=5, year=2022)

In [35]:
df_may_2

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,date
0,0.375632,Premium,E,SI2,61.216611,58.520365,6647.947013,7.047650,3.953273,4.544616,2022-05-01 00:00:00.000000
1,1.547515,Ideal,F,VVS2,61.812195,52.894064,15592.453815,5.324176,5.771630,3.778284,2022-05-01 00:08:55.787157
2,0.582045,Ideal,I,VVS2,62.252235,59.188428,6637.133011,5.840766,4.448582,3.038797,2022-05-01 00:17:51.574314
3,1.439500,Ideal,G,VVS2,60.745982,61.262146,2827.939980,4.983051,3.591145,3.403229,2022-05-01 00:26:47.361471
4,0.200000,Very Good,E,VVS1,60.278692,55.978424,3089.789291,6.609352,5.966460,3.566855,2022-05-01 00:35:43.148628
...,...,...,...,...,...,...,...,...,...,...,...
4995,0.613172,Fair,F,SI2,60.422473,58.216722,9266.548817,6.330237,7.250225,2.655026,2022-05-31 23:24:16.849215
4996,1.143782,Ideal,I,SI1,60.777618,55.582045,326.000000,7.182579,6.119178,3.613266,2022-05-31 23:33:12.636372
4997,1.050558,Premium,J,SI1,63.721081,56.859048,8308.743826,5.452675,5.793473,3.056624,2022-05-31 23:42:08.423529
4998,1.033827,Ideal,F,SI1,61.666114,58.242220,18714.179426,5.629926,4.584655,2.819902,2022-05-31 23:51:04.210686


In [37]:
df_may_2.to_parquet("./data/may_2022.parquet")

In [38]:
!pwd

/Users/isaachurwitz/my-project/diamond_monitoring


In [68]:
# adding some missing features to the synthtic data

def add_missing_features(df, missing_prob=0.2):
    df = df.copy()
    num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

    for col in num_features:
        for row in range(int(df.shape[0] * 0.6)):
            if random.random() < 0.1:
                df.at[row, col] = np.nan
        for row in range(int(df.shape[0] * 0.6), df.shape[0]):
            if random.random() < 0.35:
                df.at[row, col] = np.nan

    return df



In [69]:
df_may_2.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
date       0
dtype: int64

In [70]:
df_may_3 = add_missing_features(df_may_2, missing_prob=0.3)

In [71]:
df_may_3.isna().sum()

carat      1038
cut           0
color         0
clarity       0
depth      1035
table      1055
price         0
x           990
y          1020
z          1005
date          0
dtype: int64

In [72]:
df_may_3.to_parquet("./data/may_2022.parquet")

In [65]:
a = pd.read_parquet("./data/may_2022.parquet")

In [66]:
a

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,date
0,0.375632,Premium,E,SI2,61.216611,58.520365,6647.947013,7.047650,3.953273,4.544616,2022-05-01 00:00:00.000000
1,1.547515,Ideal,F,VVS2,61.812195,52.894064,15592.453815,5.324176,5.771630,3.778284,2022-05-01 00:08:55.787157
2,0.582045,Ideal,I,VVS2,62.252235,59.188428,6637.133011,5.840766,4.448582,3.038797,2022-05-01 00:17:51.574314
3,1.439500,Ideal,G,VVS2,60.745982,61.262146,2827.939980,4.983051,3.591145,3.403229,2022-05-01 00:26:47.361471
4,0.200000,Very Good,E,VVS1,60.278692,55.978424,3089.789291,6.609352,5.966460,3.566855,2022-05-01 00:35:43.148628
...,...,...,...,...,...,...,...,...,...,...,...
4995,0.613172,Fair,F,SI2,,58.216722,9266.548817,6.330237,,2.655026,2022-05-31 23:24:16.849215
4996,1.143782,Ideal,I,SI1,,,326.000000,7.182579,6.119178,,2022-05-31 23:33:12.636372
4997,1.050558,Premium,J,SI1,63.721081,56.859048,8308.743826,5.452675,5.793473,,2022-05-31 23:42:08.423529
4998,,Ideal,F,SI1,,58.242220,18714.179426,,4.584655,,2022-05-31 23:51:04.210686


In [52]:
for i in range(9, 10):
    print(i)

9
