In [4]:
import pandas as pd

def split_csv(file_path, chunk_size, output_prefix):
    # Read the large CSV file in chunks
    chunk_iter = pd.read_csv(file_path, chunksize=chunk_size)
    
    # Process each chunk and save it as a new CSV file
    for i, chunk in enumerate(chunk_iter):
        output_file = f"{output_prefix}_part_{i+1}.csv"
        chunk.to_csv(output_file, index=False)
        print(f"Saved {output_file}")

# Define the parameters
file_path = 'supervised_dataset.csv'
chunk_size = 10000  # Adjust the chunk size as needed
output_prefix = 'supervised_dataset_chunk'

# Split the CSV file into smaller chunks
split_csv(file_path, chunk_size, output_prefix)


Saved supervised_dataset_chunk_part_1.csv
Saved supervised_dataset_chunk_part_2.csv
Saved supervised_dataset_chunk_part_3.csv
Saved supervised_dataset_chunk_part_4.csv
Saved supervised_dataset_chunk_part_5.csv
Saved supervised_dataset_chunk_part_6.csv
Saved supervised_dataset_chunk_part_7.csv
Saved supervised_dataset_chunk_part_8.csv
Saved supervised_dataset_chunk_part_9.csv
Saved supervised_dataset_chunk_part_10.csv
Saved supervised_dataset_chunk_part_11.csv
Saved supervised_dataset_chunk_part_12.csv
Saved supervised_dataset_chunk_part_13.csv
Saved supervised_dataset_chunk_part_14.csv
Saved supervised_dataset_chunk_part_15.csv
Saved supervised_dataset_chunk_part_16.csv
Saved supervised_dataset_chunk_part_17.csv
Saved supervised_dataset_chunk_part_18.csv
Saved supervised_dataset_chunk_part_19.csv


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder

file_path = 'supervised_dataset_chunk_part_1.csv'

# Load the dataset
data = pd.read_csv(file_path, low_memory=False)
data.head()

# Define the categorical and numerical columns
categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                       "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                       "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                       "Information record number", "Information record type",  "Product group",
                       "Base unit"]
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                     "Price unit", "Plant processing time", "Material master time"]

# Convert all categorical columns to strings to handle mixed types
for col in categorical_columns:
    data[col] = data[col].astype(str)

# Handling missing values by replacing them with the median for numerical columns
for col in numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_categorical_data = encoder.fit_transform(data[categorical_columns])
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_columns))

# Combine numerical and encoded categorical data
data_combined = pd.concat([data[numerical_columns], encoded_categorical_df], axis=1)

# Define the features (X) and the target (y)
X = data_combined
y = data['anomaly']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model for anomaly detection
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict anomaly labels using XGBoost
y_pred = xgb_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

# Find the top 5 columns with the most importance according to XGBoost
top_5_columns = xgb_model.feature_importances_.argsort()[-5:][::-1]

# Plot the distributions of the top 5 most important columns
for col_index in top_5_columns:
    column_name = X.columns[col_index]
    column_data = data[column_name].dropna()  # Drop NaN values from the column
    if not column_data.empty:
        plt.figure(figsize=(10, 6))
        plt.hist(column_data, bins=50, label=f'Distribution of {column_name}')
        plt.title(f'Distribution of {column_name}')
        plt.xlabel(column_name)
        plt.ylabel('Frequency')
        plt.legend()
        plt.savefig(f'{column_name}_distribution.png')
        plt.show()
    else:
        print(f"No data available for plotting {column_name}.")

# Save the detected anomaly column in the original DataFrame
data['detected_anomaly'] = xgb_model.predict(X)

# Save the original DataFrame with detected anomalies to a new CSV file
original_data_with_anomalies = data[['detected_anomaly'] + numerical_columns + list(encoded_categorical_df.columns)]  # Include numerical and encoded categorical columns
original_data_with_anomalies.to_csv('supervised_dataset_chunk_part_1_with_anomalies_xgboost.csv', index=False)


Confusion Matrix:
[[ 278    1]
 [   3 1718]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       279
           1       1.00      1.00      1.00      1721

    accuracy                           1.00      2000
   macro avg       0.99      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Accuracy Score:
0.998


KeyError: 'Plant_2100'

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder
import dask.dataframe as dd

file_path = 'supervised_dataset.csv'
data = dd.read_csv('supervised_dataset.csv')


data = dd.from_pandas(pd.DataFrame(data), npartitions=1)

# Viewing initial data types
print(data.dtypes)

# Changing the data type of 'col1' to int32
data = data.astype({'Product group': 'object'})

# Define the categorical and numerical columns
categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                       "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                       "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                       "Information record number", "Information record type",  "Product group",
                       "Base unit"]
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                     "Price unit", "Plant processing time", "Material master time"]

# Convert all categorical columns to strings to handle mixed types
for col in categorical_columns:
    data[col] = data[col].astype(str)

# Handling missing values by replacing them with the median for numerical columns
for col in numerical_columns:
    if data[col].isna().any():
        data[col] = data[col].fillna(data[col].median())

# One-hot encoding for categorical columns
encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
encoded_categorical_data = encoder.fit_transform(data[categorical_columns])

# Convert encoded categorical data to DataFrame
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_columns))

# Combine numerical and encoded categorical data
data_combined = dd.concat([data[numerical_columns], encoded_categorical_df], axis=1)

# Define the target (y)
y = data['anomaly']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_combined.compute(), y.compute(), test_size=0.2, random_state=42)

# Train XGBoost model for anomaly detection
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict anomaly labels using XGBoost
y_pred = xgb_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+---------------+--------+----------+
| Column        | Found  | Expected |
+---------------+--------+----------+
| Product group | object | int64    |
+---------------+--------+----------+

The following columns also raised exceptions on conversion:

- Product group
  ValueError("invalid literal for int() with base 10: 'LEEB'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'Product group': 'object'}

to the call to `read_csv`/`read_table`.

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("Stammdaten.csv", low_memory=False)

In [None]:
categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                                    "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                                    "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                                    "Information record number", "Information record type",  "Product group",
                                    "Base unit"]
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]

In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder



class Data_Preprocessing():
    def __init__(self, file_path) -> None:
        self.data = pd.read_csv(file_path, low_memory=False, nrows=3000)
        
        self.rename_to_english()
        
        print(self.data.columns)
        
        # Specify categorical and numerical numbers manually
        self.categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                                    "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                                    "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                                    "Information record number", "Information record type",  "Product group",
                                    "Base unit"]
        self.numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]
        
        # Initialize MinMaxScaler
        self.scaler = MinMaxScaler()
    
    def rename_to_english(self):
        self.data.rename(columns={"Materialnummer": "Material number", "Lieferant OB": "Supplier", "Vertrag OB": "Contract", 
                                 "Vertragsposition OB": "Contract Position", "Planlieferzeit Vertrag": "Fulfillment time", 
                                 "Vertrag Fix1": "Fixed contract 1", "Vertrag_Fix2": "Fixed contract 2", "Beschaffungsart": 
                                 "Procurement type", "Sonderbeschaffungsart": "Special procurement type", "Disponent":
                                 "Dispatcher", "Einkäufer": "Buyer", "DispoGruppe": "Purchasing group", "Dispolosgröße": 
                                 "Purchasing lot size", "Gesamtbestand": "Total quantity", "Gesamtwert": "Total value",
                                 "Preiseinheit": "Price unit", "Kalender": "Calendar", "Werk OB": "Plant", "Werk Infosatz":
                                 "Plant information record", "Infosatznummer": "Information record number", "Infosatztyp":
                                 "Information record type", "WE-Bearbeitungszeit": "Plant processing time", "Planlieferzeit Mat-Stamm":
                                 "Material master time", "Warengruppe": "Product group", "Basiseinheit": "Base unit"}, inplace=True)

    def normalize_data(self, numerical_data):
        # Fit and transform the data
        normalized_data = self.scaler.fit_transform(numerical_data)

        return normalized_data

    def preprocess_data(self):
        self.data[self.categorical_columns] = self.data[self.categorical_columns].astype('category')
        self.data[self.numerical_columns] = self.data[self.numerical_columns].astype('int64')


        # If Delivery time is 0, the value is missing
        self.data["Fulfillment time"] = self.data["Fulfillment time"].replace(0, np.nan)
        self.data["Material master time"] = self.data["Material master time"].replace(0, np.nan)

        # If processing time is 0, the value is missing
        self.data["Plant processing time"] = self.data["Plant processing time"].replace(0, np.nan)

        # If total quantity is 0, the value is missing
        self.data["Total quantity"] = self.data["Total quantity"].replace(0, np.nan)

        # If total value is 0, the toal value is not known or missing
        self.data["Total value"] = self.data["Total value"].replace(0, np.nan)

        self.data["Fixed contract 1"] = self.data["Fixed contract 1"].replace(0, np.nan)
        self.data["Fixed contract 2"] = self.data["Fixed contract 2"].replace(0, np.nan)


        return self.data
    

    def preprocess_dbscan(self, data):
        numerical_columns = ["Fulfillment time", "Fixed contract 1"]
        data = data[numerical_columns]

        # Remove rows with NaN values
        data_without_nan = data.dropna(axis=0)

        return data_without_nan
    
    def preprocess_data_kmean(self):
        """
        Preprocess the data by imputing missing values, performing one-hot encoding for categorical variables,
        and performing feature normalization.

        Parameters:
        - data: pandas DataFrame containing the dataset

        Returns:
        - processed_data: pandas DataFrame containing imputed missing values, one-hot encoded features, and normalized features
        """
        # Separate numeric and categorical columns
        # numeric_cols = data.select_dtypes(include=np.number).columns
        # categorical_cols = data.select_dtypes(include='object').columns

        #data = data[numerical_columns]

        self.data=self.preprocess_data()

        not_scaled_data = self.data.copy()

        # categorical_cols = ["Materialnummer", "Lieferant OB", "Vertragsposition OB", "Beschaffungsart", "Disponent", "Einkäufer", "Dispolosgröße", "Werk OB", "Warengruppe", "Basiseinheit"]
        # numeric_cols = ["Planlieferzeit Vertrag", "Vertrag Fix1", "Vertrag_Fix2", "Gesamtbestand", "Gesamtwert", "Preiseinheit", "WE-Bearbeitungszeit", "Planlieferzeit Mat-Stamm"]
        
        # Impute missing values using mean imputation for numeric columns
        # imputer = SimpleImputer(strategy='mean')
        # data_numeric_imputed = pd.DataFrame(imputer.fit_transform(self.data[self.numerical_columns]), columns=self.numerical_columns)

        # One-hot encode categorical variables
        if len(self.categorical_columns) > 0:
            encoder = OneHotEncoder(drop='first')
            data_encoded = encoder.fit_transform(self.data[self.categorical_columns])
            column_names = encoder.get_feature_names_out(self.categorical_columns)
            data_imputed_encoded = pd.DataFrame(data_encoded.toarray(), columns=column_names)
        else:
            data_imputed_encoded = pd.DataFrame()

        # Combine numeric and encoded categorical columns
        processed_data = pd.concat([self.data[self.numerical_columns], data_imputed_encoded], axis=1)


        # Normalize features
        scaler = StandardScaler()
        processed_data = pd.DataFrame(scaler.fit_transform(processed_data), columns=processed_data.columns)


        return processed_data, not_scaled_data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from gensim.models import Word2Vec
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy import stats

class Data_Preprocessing:
    def __init__(self, file_path) -> None:
        self.data = pd.read_csv(file_path, low_memory=False, nrows=3000)
        
        self.rename_to_english()
        
        print(self.data.columns)
        
        # Specify categorical and numerical numbers manually
        self.categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                                    "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                                    "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                                    "Information record number", "Information record type",  "Product group",
                                    "Base unit"]
        self.numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]
        
        # Initialize MinMaxScaler
        self.scaler = MinMaxScaler()
    
    def rename_to_english(self):
        self.data.rename(columns={"Materialnummer": "Material number", "Lieferant OB": "Supplier", "Vertrag OB": "Contract", 
                                 "Vertragsposition OB": "Contract Position", "Planlieferzeit Vertrag": "Fulfillment time", 
                                 "Vertrag Fix1": "Fixed contract 1", "Vertrag_Fix2": "Fixed contract 2", "Beschaffungsart": 
                                 "Procurement type", "Sonderbeschaffungsart": "Special procurement type", "Disponent":
                                 "Dispatcher", "Einkäufer": "Buyer", "DispoGruppe": "Purchasing group", "Dispolosgröße": 
                                 "Purchasing lot size", "Gesamtbestand": "Total quantity", "Gesamtwert": "Total value",
                                 "Preiseinheit": "Price unit", "Kalender": "Calendar", "Werk OB": "Plant", "Werk Infosatz":
                                 "Plant information record", "Infosatznummer": "Information record number", "Infosatztyp":
                                 "Information record type", "WE-Bearbeitungszeit": "Plant processing time", "Planlieferzeit Mat-Stamm":
                                 "Material master time", "Warengruppe": "Product group", "Basiseinheit": "Base unit"}, inplace=True)

    def preprocess_data(self):
        self.data[self.categorical_columns] = self.data[self.categorical_columns].astype('category')
        self.data[self.numerical_columns] = self.data[self.numerical_columns].astype('int64')


        # If Delivery time is 0, the value is missing
        self.data["Fulfillment time"] = self.data["Fulfillment time"].replace(0, np.nan)
        self.data["Material master time"] = self.data["Material master time"].replace(0, np.nan)

        # If processing time is 0, the value is missing
        self.data["Plant processing time"] = self.data["Plant processing time"].replace(0, np.nan)

        # If total quantity is 0, the value is missing
        self.data["Total quantity"] = self.data["Total quantity"].replace(0, np.nan)

        # If total value is 0, the toal value is not known or missing
        self.data["Total value"] = self.data["Total value"].replace(0, np.nan)

        self.data["Fixed contract 1"] = self.data["Fixed contract 1"].replace(0, np.nan)
        self.data["Fixed contract 2"] = self.data["Fixed contract 2"].replace(0, np.nan)


        return self.data

    def preprocess_data_kmean(self):
        self.data = self.preprocess_data()
        not_scaled_data = self.data.copy()

        if len(self.categorical_columns) > 0:
            encoder = OneHotEncoder(drop='first')
            data_encoded = encoder.fit_transform(self.data[self.categorical_columns])
            column_names = encoder.get_feature_names_out(self.categorical_columns)
            data_imputed_encoded = pd.DataFrame(data_encoded.toarray(), columns=column_names)
        else:
            data_imputed_encoded = pd.DataFrame()

        processed_data = pd.concat([self.data[self.numerical_columns], data_imputed_encoded], axis=1)
        scaler = StandardScaler()
        processed_data = pd.DataFrame(scaler.fit_transform(processed_data), columns=processed_data.columns)

        return processed_data, not_scaled_data

file_path = 'Stammdaten.csv'

Data = Data_Preprocessing(file_path=file_path)
data, not_processed_data = Data.preprocess_data_kmean()

# Handling missing values by replacing them with the median of each column
for col in Data.numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# Applying Word2Vec for categorical columns
categorical_data = not_processed_data[Data.categorical_columns].astype(str).values.tolist()
w2v_model = Word2Vec(sentences=categorical_data, vector_size=100, window=5, min_count=1, workers=4, seed=42)
word_vectors = w2v_model.wv

# Create embeddings for each categorical feature
for col in Data.categorical_columns:
    not_processed_data[col + '_embedding'] = not_processed_data[col].apply(lambda x: word_vectors[x])

# Combine all embeddings into a single feature set
embedding_features = np.array(not_processed_data[[col + '_embedding' for col in Data.categorical_columns]].values.tolist())
embedding_features = embedding_features.reshape(len(not_processed_data), -1)

# Combine numerical and embedding features
X = np.hstack((not_processed_data[Data.numerical_columns].values, embedding_features))

# Applying Z-score for anomaly detection in numeric columns
for col in Data.numerical_columns:
    data[col + '_z_score'] = np.abs(stats.zscore(data[col]))
    data[col + '_outlier'] = 0
    data.loc[data[col + '_z_score'] > 3, col + '_outlier'] = 1

data['anomaly_label'] = data[[col + '_outlier' for col in Data.numerical_columns]].max(axis=1)

# Train XGBoost model for anomaly detection
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X, data['anomaly_label'])

# Predict anomaly labels using XGBoost
data['xgb_anomaly'] = xgb_model.predict(X)

# Save the detected anomaly column in the original DataFrame
original_data = pd.read_csv(file_path)
original_data['detected_anomaly'] = data['xgb_anomaly']

# Save the original DataFrame with detected anomalies to a new CSV file
original_data.to_csv('updated_with_anomalies_xgboost.csv', index=False)


In [None]:
categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                                    "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                                    "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                                    "Information record number", "Information record type",  "Product group",
                                    "Base unit"]
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]
        


file_path = 'supervised_dataset.csv'

data = pd.read_csv(file_path)
data.head()


import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import stats
import matplotlib.pyplot as plt

# Handling missing values by replacing them with the median of each column
for col in numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# Applying Z-score for anomaly detection in numeric columns
for col in numerical_columns:
    data[col + '_z_score'] = np.abs(stats.zscore(data[col]))
    data[col + '_outlier'] = 0
    data.loc[data[col + '_z_score'] > 3, col + '_outlier'] = 1  # Any Z-score > 3 is considered an outlier

# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = data[[col + '_outlier' for col in numerical_columns]].max(axis=1)

# Train XGBoost model for anomaly detection
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(data[numerical_columns], data['anomaly_label'])

# Predict anomaly labels using XGBoost
data['xgb_anomaly'] = xgb_model.predict(data[numerical_columns])

# Find the top 5 columns with the most anomalies predicted by XGBoost
top_5_columns = xgb_model.feature_importances_.argsort()[-5:][::-1]

# Plot the distributions of the top 5 most anomalous columns
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    column_data = data[column_name].dropna()  # Drop NaN values from the column
    if not column_data.empty:
        plt.figure(figsize=(10, 6))
        plt.hist(column_data, bins=50, label=f'Distribution of {column_name}')
        plt.title(f'Distribution of {column_name}')
        plt.xlabel(column_name)
        plt.ylabel('Frequency')
        plt.legend()
        plt.savefig(f'{column_name}_distribution.png')
        plt.show()
    else:
        print(f"No data available for plotting {column_name}.")


# Save the detected anomaly column in the original DataFrame
data['detected_anomaly'] = np.where(data['anomaly_label'] == 1, 1, 0)

# Save the original DataFrame with detected anomalies to a new CSV file
original_data_with_anomalies = data[['detected_anomaly'] + numerical_columns]  # Include only numerical columns
original_data_with_anomalies.to_csv('supervised_dataset_with_anomalies_xgboost.csv', index=False)



In [None]:
categorical_columns = ["Material number", "Supplier", "Contract", "Contract Position", "Procurement type", 
                                    "Special procurement type", "Dispatcher", "Buyer", "Purchasing group", 
                                    "Purchasing lot size", "Calendar", "Plant", "Plant information record", 
                                    "Information record number", "Information record type",  "Product group",
                                    "Base unit"]
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]
        


file_path = 'Stammdaten.csv'

Data=Data_Preprocessing(file_path=file_path)
data, not_processed_data= Data.preprocess_data_kmean()
print(data)


import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import stats
import matplotlib.pyplot as plt

# Encode categorical variables
#categorical_columns = data.select_dtypes(include=['object']).columns
#data[categorical_columns] = data[categorical_columns].apply(lambda col: pd.factorize(col)[0])


# Handling missing values by replacing them with the median of each column
# for col in numerical_columns:
#     if data[col].isna().any():
#         data[col].fillna(data[col].median(), inplace=True)

# Applying Z-score for anomaly detection in numeric columns

import pandas as pd
import numpy as np
from scipy import stats

# Assuming 'data' is your DataFrame containing numerical features

# Calculate mean and standard deviation for each numerical column
mean_values = data[numerical_columns].mean()
std_dev_values = data[numerical_columns].std()

# Calculate Z-scores for each numerical column
z_scores = (data[numerical_columns] - mean_values) / std_dev_values

# Optionally, you can add the Z-scores as new columns to the existing DataFrame
for col in numerical_columns:
    data[f'{col}_z_score'] = z_scores[col]


# Create a DataFrame to store the outlier flags
outlier_flags = pd.DataFrame()

# Apply outlier detection threshold (e.g., Z-score > 3) to identify outliers
for col in numerical_columns:
    outlier_flags[col + '_outlier'] = (z_scores[col] > 3).astype(int)

# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = outlier_flags.max(axis=1)

# No need to drop outlier flags columns since they haven't been added yet
# Concatenate the anomaly label column with the input data
data = pd.concat([data, outlier_flags], axis=1)


# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = data[[col + '_outlier' for col in numerical_columns]].max(axis=1)


features = data.drop(columns=[col + '_z_score' for col in numerical_columns])
# Remove columns related to anomaly detection from the input data

import xgboost as xgb
from sklearn.model_selection import train_test_split

# Assuming 'data' is your DataFrame containing numerical features and 'anomaly_label' is the binary target variable

# Split data into features and target variable
X = data.drop(columns=['anomaly_label'])
y = data['anomaly_label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X, y)

# Predict anomalies on the testing data
y_pred_proba = xgb_model.predict_proba(X)[:, 1]  # Probability of being an outlier
y_pred = xgb_model.predict(X)  # Binary prediction (0 or 1)

# You can now use y_pred_proba or y_pred for further analysis or evaluation


# You can now use y_pred_proba or y_pred for further analysis or evaluation

data_without_nan = data.dropna(subset=['Total quantity_z_score'])

print(data['anomaly_label'])

import matplotlib.pyplot as plt

# Define a function to create scatter plots for anomalies
def visualize_anomalies(data, anomaly_labels):
    # Scatter plot for "Total quantity" column
    plt.figure(figsize=(10, 6))
    plt.scatter(data['Total quantity'], data['Total value'], c=anomaly_labels, cmap='coolwarm')
    plt.xlabel('Total quantity')
    plt.ylabel('Total quantity')
    plt.title('Anomalies in Total Quantity')
    plt.colorbar(label='Anomaly Label')
    plt.show()

# Assuming you have already predicted anomaly labels for the dataset
# Replace 'anomaly_labels' with your actual anomaly labels
visualize_anomalies(data, y_pred)

# Visualize statistical outliers in one of the numeric columns
plt.figure(figsize=(10, 6))
plt.hist(data_without_nan['Total quantity_z_score'],bins=50,  label='Z-scores')
plt.axvline(3, color='red', linestyle='dashed', linewidth=2, label='Outlier Threshold')
plt.title('Histogram of Z-scores for Total Inventory')
plt.xlabel('Z-score')
plt.ylabel('Frequency')
plt.legend()
plt.show()

import seaborn as sns


# Define a function to create scatter plots for anomalies
# def visualize_anomalies2(data, anomaly_labels):
#     # Pairplot for all numerical features
#     sns.pairplot(data, hue=da'anomaly_labels', palette={0: 'blue', 1: 'red'})
#     plt.title('Scatter Plot of Numerical Features with Anomalies')
#     plt.show()

# # Visualize anomalies
# visualize_anomalies2(data[numerical_columns], y_pred)


# Visualize statistical outliers in one of the numeric columns
plt.figure(figsize=(10, 6))
plt.hist(data_without_nan['Total quantity_z_score'],  label='Z-scores')
plt.axvline(3, color='red', linestyle='dashed', linewidth=2, label='Outlier Threshold')
plt.title('Histogram of Z-scores for Total Inventory')
plt.xlabel('Z-score')
plt.ylabel('Frequency')
plt.legend()
plt.show()


# Find the top 5 columns with the most anomalies predicted by XGBoost
top_5_columns = xgb_model.feature_importances_.argsort()[-5:][::-1]

# Plot the distributions of the top 5 most anomalous columns
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    plt.figure(figsize=(10, 6))
    plt.hist(data[column_name+'_z_score'], bins=50, label=f'Distribution of Z Score in {column_name}')
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig(f'{column_name}_distribution.png')
    plt.show()
# Save the updated dataset
data.to_csv('updated_with_anomalies_xgboost_stammdaten.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load and preprocess your data
file_path = 'Stammdaten.csv'
# Assuming the Data_Preprocessing class is defined elsewhere that you can import
Data = Data_Preprocessing(file_path=file_path)
data, not_processed_data = Data.preprocess_data_kmean()

# Define numerical columns
numerical_columns = ["Fulfillment time", "Fixed contract 1", "Fixed contract 2", "Total quantity", "Total value", 
                                  "Price unit", "Plant processing time", "Material master time"]

# Handling missing values by replacing them with the median of each column
for col in numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# Calculate mean and standard deviation for each numerical column
mean_values = data[numerical_columns].mean()
std_dev_values = data[numerical_columns].std()

# Calculate Z-scores for each numerical column
z_scores = (data[numerical_columns] - mean_values) / std_dev_values

# Optionally, you can add the Z-scores as new columns to the existing DataFrame
for col in numerical_columns:
    data[f'{col}_z_score'] = z_scores[col]

# Create an outlier flag for each Z-score
for col in numerical_columns:
    data[col + '_outlier'] = (np.abs(data[f'{col}_z_score']) > 3).astype(int)

# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = data[[col + '_outlier' for col in numerical_columns]].max(axis=1)

# Split data into features and target variable for model fitting
X = data[numerical_columns]  # Use only the original numerical features for model training
y = data['anomaly_label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict anomalies on the testing data
y_pred = xgb_model.predict(X_test)

# Find the top 5 columns with the most anomalies predicted by XGBoost
top_5_columns = xgb_model.feature_importances_.argsort()[-5:][::-1]

# Plot the distributions of the top 5 most anomalous columns
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    # Check if the column contains only NaN values
    if data[column_name].isna().all():
        print(f"Skipping plot for {column_name} as it contains only NaN values.")
        continue
    plt.figure(figsize=(10, 6))
    try:
        plt.hist(data[column_name].dropna(), bins=50, label=f'Distribution of {column_name}')  # Ensure to drop NaNs for plotting
        plt.title(f'Distribution of {column_name}')
        plt.xlabel(column_name)
        plt.ylabel('Frequency')
        plt.legend()
        plt.savefig(f'{column_name}_distribution.png')
        plt.show()
    except ValueError as e:
        print(f"Failed to plot {column_name} due to an error: {e}")


# Save the updated dataset
data.to_csv('updated_with_anomalies_xgboost_stammdaten.csv', index=False)


In [None]:
# Plot the scatter of the top 5 most anomalous columns with anomalies highlighted
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    
    if data[column_name].isna().all():
        print(f"Skipping plot for {column_name} as it contains only NaN values.")
        continue

    plt.figure(figsize=(10, 6))
    normal_data = data[data['anomaly_label'] == 0]
    anomalies = data[data['anomaly_label'] == 1]

    if normal_data.empty or anomalies.empty:
        print(f"No sufficient data to plot for {column_name}.")
        continue

    plt.scatter(normal_data[column_name], normal_data[column_name], color='blue', label='Normal', alpha=0.5, edgecolors='w')
    plt.scatter(anomalies[column_name], anomalies[column_name], color='red', label='Anomaly', alpha=0.5, edgecolors='w')
    
    plt.title(f'Scatter Plot for {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(column_name)  # Adjust as needed
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{column_name}_anomalies_scatter.png')
    plt.show()


In [None]:
# Plot the distributions of the top 5 most anomalous columns with anomalies highlighted
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    
    # Check if the column contains only NaN values
    if data[column_name].isna().all():
        print(f"Skipping plot for {column_name} as it contains only NaN values.")
        continue

    # Prepare the figure
    plt.figure(figsize=(10, 6))
    normal_data = data[data[anomaly_column] == 0]
    anomalies = data[data[anomaly_column] == 1]

    # Check that there are normal and anomalous points to plot
    if normal_data.empty or anomalies.empty:
        print(f"No sufficient data to plot for {column_name}.")
        continue

    # Plot normal points
    plt.scatter(normal_data[column_name], normal_data[column_name], color='blue', label='Normal', alpha=0.6, edgecolors='w')

    # Plot anomalies
    plt.scatter(anomalies[column_name], anomalies[column_name], color='red', label='Anomaly', alpha=0.6, edgecolors='w')

    # Additional plot formatting
    plt.title(f'Scatter Plot for {column_name}')
    plt.xlabel(column_name)
    plt.ylabel(column_name)  # Typically you might plot against another feature or itself with jitter
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{column_name}_anomalies_scatter.png')
    plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from scipy import stats
import matplotlib.pyplot as plt

# Handling missing values by replacing them with the median of each column
for col in numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# Applying Z-score for anomaly detection in numeric columns
for col in numerical_columns:
    data[col + '_z_score'] = np.abs(stats.zscore(data[col]))
    data[col + '_outlier'] = 0
    data.loc[data[col + '_z_score'] > 3, col + '_outlier'] = 1  # Any Z-score > 3 is considered an outlier

# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = data[[col + '_outlier' for col in numerical_columns]].max(axis=1)

# Train Isolation Forest model for anomaly detection
if_model = IsolationForest(random_state=42)
if_model.fit(data[numerical_columns])

# Predict anomaly labels using Isolation Forest
data['if_anomaly'] = if_model.predict(data[numerical_columns])
data['if_anomaly'] = np.where(data['if_anomaly'] == -1, 1, 0)  # Convert -1 to 1 for anomaly, 1 to 0 for normal

# Find the top 5 columns with the most anomalies predicted by Isolation Forest
top_5_columns = np.argsort(np.sum(np.abs(if_model.decision_function(data[numerical_columns]))))

# Plot the distributions of the top 5 most anomalous columns
for col_index in top_5_columns[:5]:
    column_name = numerical_columns[col_index]
    plt.figure(figsize=(10, 6))
    plt.hist(data[column_name], bins=50, label=f'Distribution of {column_name}')
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig(f'{column_name}_distribution.png')
    plt.show()

# Save the updated dataset
data.to_csv('updated_with_anomalies_iforest_stammdaten.csv', index=False)


In [None]:
# Scatter plots for detected anomalies
plt.figure(figsize=(15, 10))
for col_index in range(len(numerical_columns)):
    column_name = numerical_columns[col_index]
    # Plot anomalies detected by XGBoost
    plt.scatter(data[column_name][data['if_anomaly'] == 1], data[column_name][data['if_anomaly'] == 1], c='red', label='Anomaly')
    # Plot normal data points
    plt.scatter(data[column_name][data['if_anomaly'] == 0], data[column_name][data['if_anomaly'] == 0], c='blue', label='Normal')
    plt.xlabel(column_name)
    plt.ylabel('Value')
    plt.title(f'Scatter Plot of {column_name}')
    plt.legend()
    plt.savefig(f'{column_name}_scatter.png')
    plt.show()

In [None]:
# Calculate ratios of detected anomalies on each column
anomaly_ratios = (data.filter(regex='_outlier$').sum() / len(data)).sort_values(ascending=False)

# Identify the most bizarre values of each column
most_bizarre_values = {}
for col in numerical_columns:
    # Calculate Z-score for each column
    z_scores = np.abs(stats.zscore(data[col]))
    # Identify the most bizarre value
    most_bizarre_value = data.loc[np.argmax(z_scores), col]
    most_bizarre_values[col] = most_bizarre_value

# Write the report to a text file
with open("anomaly_report_iforest.txt", "w") as f:
    f.write("Anomaly Ratios:\n")
    for col, ratio in anomaly_ratios.items():
        f.write(f"{col}: {ratio:.2f}\n")
    f.write("\nMost Bizarre Values:\n")
    for col, value in most_bizarre_values.items():
        f.write(f"{col}: {value}\n")
        f.write(f"Description: This value might occur due to ...\n\n")

In [None]:
# Drop columns with NaN values
data = df.dropna(axis=1)
    
categorical_columns = ["Materialnummer", "Lieferant OB", "Vertragsposition OB", "Beschaffungsart", "Disponent", "Einkäufer", "Dispolosgröße", "Werk OB", "Warengruppe", "Basiseinheit"]
numerical_columns = ["Planlieferzeit Vertrag", "Vertrag Fix1", "Vertrag_Fix2", "Gesamtbestand", "Gesamtwert", "Preiseinheit", "WE-Bearbeitungszeit", "Planlieferzeit Mat-Stamm"]
    
data[categorical_columns] = data[categorical_columns].astype('category')
data[numerical_columns] = data[numerical_columns].astype('int64')

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from scipy import stats
import matplotlib.pyplot as plt

# Handling missing values by replacing them with the median of each column
for col in numerical_columns:
    if data[col].isna().any():
        data[col].fillna(data[col].median(), inplace=True)

# Applying Z-score for anomaly detection in numeric columns
for col in numerical_columns:
    data[col + '_z_score'] = np.abs(stats.zscore(data[col]))
    data[col + '_outlier'] = 0
    data.loc[data[col + '_z_score'] > 3, col + '_outlier'] = 1  # Any Z-score > 3 is considered an outlier

# Combine all outlier flags to a single anomaly label
data['anomaly_label'] = data[[col + '_outlier' for col in numerical_columns]].max(axis=1)

# Train XGBoost model for anomaly detection
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(data[numerical_columns], data['anomaly_label'])

# Predict anomaly labels using XGBoost
data['xgb_anomaly'] = xgb_model.predict(data[numerical_columns])

# Find the top 5 columns with the most anomalies predicted by XGBoost
top_5_columns = xgb_model.feature_importances_.argsort()[-5:][::-1]

# Plot the distributions of the top 5 most anomalous columns
for col_index in top_5_columns:
    column_name = numerical_columns[col_index]
    plt.figure(figsize=(10, 6))
    plt.hist(data[column_name], bins=50, label=f'Distribution of {column_name}')
    plt.title(f'Distribution of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.legend()
    plt.savefig(f'{column_name}_distribution.png')
    plt.show()

# Save the updated dataset
data.to_csv('updated_with_anomalies_xgboost_stammdaten.csv', index=False)


In [None]:
# Scatter plots for detected anomalies
plt.figure(figsize=(15, 10))
for col_index in range(len(numerical_columns)):
    column_name = numerical_columns[col_index]
    # Plot anomalies detected by XGBoost
    plt.scatter(data[column_name][data['xgb_anomaly'] == 1], data[column_name][data['xgb_anomaly'] == 1], c='red', label='Anomaly')
    # Plot normal data points
    plt.scatter(data[column_name][data['xgb_anomaly'] == 0], data[column_name][data['xgb_anomaly'] == 0], c='blue', label='Normal')
    plt.xlabel(column_name)
    plt.ylabel('Value')
    plt.title(f'Scatter Plot of {column_name}')
    plt.legend()
    plt.savefig(f'{column_name}_scatter.png')
    plt.show()


In [None]:
# Calculate ratios of detected anomalies on each column
anomaly_ratios = (data.filter(regex='_outlier$').sum() / len(data)).sort_values(ascending=False)

# Identify the most bizarre values of each column
most_bizarre_values = {}
for col in numerical_columns:
    # Calculate Z-score for each column
    z_scores = np.abs(stats.zscore(data[col]))
    # Identify the most bizarre value
    most_bizarre_value = data.loc[np.argmax(z_scores), col]
    most_bizarre_values[col] = most_bizarre_value

# Write the report to a text file
with open("anomaly_report.txt", "w") as f:
    f.write("Anomaly Ratios:\n")
    for col, ratio in anomaly_ratios.items():
        f.write(f"{col}: {ratio:.2f}\n")
    f.write("\nMost Bizarre Values:\n")
    for col, value in most_bizarre_values.items():
        f.write(f"{col}: {value}\n")
        f.write(f"Description: This value might occur due to ...\n\n")