<a href="https://colab.research.google.com/github/himanshudhami/BuiltbyChatGPT/blob/main/Datapredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
import random
import string

# Build a random customer identifier
def random_customer_id():
    """Return an 8-character ID sampled (with replacement) from A-Z and 0-9."""
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=8)
    return ''.join(picked)

# Build a random item identifier
def random_item_id():
    """Return an item ID: the prefix 'IT' followed by six random decimal digits."""
    suffix = ''.join(random.choices(string.digits, k=6))
    return 'IT' + suffix

# Draw a random price
def random_price():
    """Return a float drawn uniformly between 1 and 500, rounded to 2 decimals."""
    raw_amount = random.uniform(1, 500)
    return round(raw_amount, 2)

# Generate the sample dataset
# Seed Python's RNG so the generated values are identical on every run; the
# later split/training cells already pin random_state=42, so an unseeded
# dataset would make the whole pipeline non-reproducible.
random.seed(42)

data = []
num_samples = 1000

for _ in range(num_samples):
    # One row of each type per iteration -> 3 * num_samples rows in total
    data.append({'value': random_customer_id(), 'type': 'customerid'})
    data.append({'value': random_item_id(), 'type': 'itemid'})
    data.append({'value': random_price(), 'type': 'price'})

# Create a pandas DataFrame
df = pd.DataFrame(data)

# Shuffle the dataset (fixed random_state keeps the row order reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the dataset to a CSV file
df.to_csv('sample_data.csv', index=False)

print(df.head())


      value        type
0  IT886719      itemid
1    355.41       price
2    232.96       price
3    238.34       price
4  OXHFDXK3  customerid


In [37]:
import pandas as pd
import numpy as np

# Load the dataset
# Read 'value' explicitly as text: the features computed from it (length,
# presence of a decimal point) depend on the exact characters in the file, so
# the column must not be subject to numeric dtype inference.
df = pd.read_csv('sample_data.csv', dtype={'value': str})

# Feature extraction function
def extract_features(value):
    """Turn a raw value into the feature vector used by the classifier.

    Returns a list ``[is_number, has_decimal_point, starts_with_it, length]``
    where the first three entries are 0/1 flags and the last is the length of
    the value's string form.
    """
    text = str(value)

    # Numeric if it is an int/float object, or if its text is all digits once
    # at most one '.' has been removed
    looks_numeric = isinstance(value, (int, float)) or text.replace(".", "", 1).isdigit()

    return [
        int(looks_numeric),
        int('.' in text),              # has a decimal point
        int(text.startswith('IT')),    # 'IT' is the itemid prefix in the sample data
        len(text),
    ]

# Compute one feature vector per raw value
feature_rows = df['value'].apply(extract_features).tolist()

# Expand the vectors into named columns, one column per feature
features_df = pd.DataFrame(
    feature_rows,
    columns=['is_number', 'has_decimal_point', 'starts_with_it', 'length'],
)

# Keep the label next to its features; the raw value is no longer needed
df = pd.concat([df.drop(columns=['value']), features_df], axis=1)

df.to_csv('sample_data_with_features.csv', index=False)
print(df.head())


         type  is_number  has_decimal_point  starts_with_it  length
0      itemid          0                  0               1       8
1       price          1                  1               0       6
2       price          1                  1               0       6
3       price          1                  1               0       6
4  customerid          0                  0               0       8


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the engineered feature table
df = pd.read_csv('sample_data_with_features.csv')

# Labels live in the 'type' column; every other column is a feature
y = df['type']
X = df.drop(columns='type')

# Hold out 30% of the rows, then cut that hold-out in half:
# 70% train / 15% validation / 15% test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Re-attach the labels to their features for each split
train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Persist each split to its own CSV file
for split_frame, split_path in (
    (train_data, 'train_data.csv'),
    (val_data, 'val_data.csv'),
    (test_data, 'test_data.csv'),
):
    split_frame.to_csv(split_path, index=False)

# Report the size of each split
for shape_label, split_frame in (
    ("Training data shape:", train_data),
    ("Validation data shape:", val_data),
    ("Testing data shape:", test_data),
):
    print(shape_label, split_frame.shape)


Training data shape: (2100, 5)
Validation data shape: (450, 5)
Testing data shape: (450, 5)


In [39]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Read the persisted training and validation splits
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')

# Split each frame into its label column and its feature columns
y_train = train_data['type']
X_train = train_data.drop(columns='type')
y_val = val_data['type']
X_val = val_data.drop(columns='type')

# Fit a Random Forest with default hyperparameters and a fixed seed
# (fit() returns the estimator itself, so construction and training chain)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the validation rows
y_val_pred = clf.predict(X_val)

# Report validation performance
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
print("Accuracy on validation data:", val_accuracy)
print("\nClassification report:\n", val_report)


Accuracy on validation data: 1.0

Classification report:
               precision    recall  f1-score   support

  customerid       1.00      1.00      1.00       136
      itemid       1.00      1.00      1.00       163
       price       1.00      1.00      1.00       151

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450



In [41]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Metrics used for the final evaluation; imported here so this cell does not
# depend on an earlier cell having brought them into the kernel namespace.
from sklearn.metrics import accuracy_score, classification_report

# Load training and validation data
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')

# Separate features (X) and labels (y)
X_train = train_data.drop('type', axis=1)
y_train = train_data['type']
X_val = val_data.drop('type', axis=1)
y_val = val_data['type']

# Define the hyperparameter search space.
# 'auto' is intentionally not listed under max_features: for classifiers it
# was only an alias of 'sqrt', it was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it is rejected as an invalid parameter value.
param_dist = {
    'n_estimators': [10, 50, 100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Create the Random Forest classifier
clf = RandomForestClassifier(random_state=42)

# Create the RandomizedSearchCV object:
# 100 sampled candidates, each scored with 5-fold cross-validation on the
# training split, using all available cores (n_jobs=-1)
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1
)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Evaluate the best model on the validation dataset
best_clf = random_search.best_estimator_
y_val_pred = best_clf.predict(X_val)

# Evaluate the classifier's performance
print("Accuracy on validation data:", accuracy_score(y_val, y_val_pred))
print("\nClassification report:\n", classification_report(y_val, y_val_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 40}
Accuracy on validation data: 1.0

Classification report:
               precision    recall  f1-score   support

  customerid       1.00      1.00      1.00       136
      itemid       1.00      1.00      1.00       163
       price       1.00      1.00      1.00       151

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450



In [42]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

# Read the held-out test split
test_data = pd.read_csv('test_data.csv')

# Split the frame into its label column and its feature columns
y_test = test_data['type']
X_test = test_data.drop(columns='type')

# best_clf must already exist in the kernel: it is the tuned estimator
# produced by the hyperparameter search
y_test_pred = best_clf.predict(X_test)

# Report test performance
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print("Accuracy on testing data:", test_accuracy)
print("\nClassification report:\n", test_report)


Accuracy on testing data: 1.0

Classification report:
               precision    recall  f1-score   support

  customerid       1.00      1.00      1.00       150
      itemid       1.00      1.00      1.00       155
       price       1.00      1.00      1.00       145

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450



In [47]:
# Feature extraction function (same as in Step 2)
def extract_features(value):
    """Map a raw value to ``[is_number, has_decimal_point, starts_with_it, length]``.

    The three flags are 0/1 integers; ``length`` is the length of ``str(value)``.
    """
    as_text = str(value)

    # int/float objects count as numbers, as does text that is all digits
    # after removing at most one '.'
    if isinstance(value, (int, float)) or as_text.replace(".", "", 1).isdigit():
        is_number = 1
    else:
        is_number = 0

    has_decimal_point = int('.' in as_text)
    starts_with_it = int(as_text.startswith('IT'))

    return [is_number, has_decimal_point, starts_with_it, len(as_text)]

# Classify a single raw value with a trained model
def predict_value_type(value, model):
    """Return the type label that ``model`` predicts for ``value``.

    ``value`` is converted to a feature vector with ``extract_features`` and
    passed to ``model.predict``; the first (only) prediction is returned.
    """
    feature_vector = extract_features(value)

    # predict() is called with a batch, so wrap the single vector in a list
    predictions = model.predict([feature_vector])

    return predictions[0]

# Demo: classify one hand-written value with the tuned classifier
input_value = "12.00"
value_type = predict_value_type(value=input_value, model=best_clf)
print("The input value is of type:", value_type)


The input value is of type: price


