##### The cell below is for you to keep track of the libraries used and install those libraries quickly
##### Ensure that the proper library names are used and the syntax of `%pip install PACKAGE_NAME` is followed

In [9]:
#%pip install pandas 
#%pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should be of the format %pip install PACKAGE_NAME 

## **DO NOT CHANGE** the filepath variable
##### Instead, create a folder named 'data' in your current working directory and 
##### have the .csv file inside that. A relative path *must* be used when loading data into pandas

In [13]:
# Can have as many cells as you want for code
import pandas as pd
filepath = "./data/catA_train.csv" 
# the initialised filepath MUST be a relative path to a folder named data that contains the parquet file

### **ALL** Code for machine learning and dataset analysis should be entered below. 
##### Ensure that your code is clear and readable.
##### Comments and Markdown notes are advised to direct attention to pieces of code you deem useful.

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import datetime
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'sklearn'

In [12]:
# Read data
data = pd.read_csv('catA_train.csv')
data

FileNotFoundError: [Errno 2] No such file or directory: 'catA_train.csv'

In [None]:
data.corr()

In [None]:
data.info()

In [None]:
# Check the number of missing values in each column
data.isnull().sum()

In [None]:
# Drop rows with missing values in LONGITUDE or LATITUDE
data.dropna(subset=['LATITUDE','LONGITUDE'], inplace=True)
data

# Drop inactive companies
active_statuses = ['Active']
mask = data['Company Status (Active/Inactive)'].isin(active_statuses)
data = data[mask]
data

In [None]:
# Encode data
le=LabelEncoder()
data['AccountID']=le.fit_transform(data['AccountID'])
data['Company']=le.fit_transform(data['Company'])
data['Industry']=le.fit_transform(data['Industry'])
data['8-Digit SIC Description']=le.fit_transform(data['8-Digit SIC Description'])
data['Entity Type']=le.fit_transform(data['Entity Type'])
data['Parent Company']=le.fit_transform(data['Parent Company'])
data['Parent Country']=le.fit_transform(data['Parent Country'])
data['Ownership Type']=le.fit_transform(data['Ownership Type'])
data['Company Description']=le.fit_transform(data['Company Description'])
data['Company Status (Active/Inactive)']=le.fit_transform(data['Company Status (Active/Inactive)'])
data['Import/Export Status']=le.fit_transform(data['Import/Export Status'])
data['Fiscal Year End']=le.fit_transform(data['Fiscal Year End'])
data['Global Ultimate Company']=le.fit_transform(data['Global Ultimate Company'])
data['Global Ultimate Country']=le.fit_transform(data['Global Ultimate Country'])
data['Domestic Ultimate Company']=le.fit_transform(data['Domestic Ultimate Company'])
data

In [None]:
columns_to_onehot = ['AccountID', 'Company', 'Industry', '8-Digit SIC Description', 'Entity Type',
                     'Parent Company', 'Parent Country', 'Ownership Type', 'Company Description',
                     'Company Status (Active/Inactive)', 'Import/Export Status', 'Fiscal Year End',
                     'Global Ultimate Company', 'Global Ultimate Country', 'Domestic Ultimate Company']

# Apply one-hot encoding
data_encoded = pd.get_dummies(data, columns=columns_to_onehot)


data_encoded.head()

In [None]:
selected_features = ['LATITUDE', 'LONGITUDE', '8-Digit SIC Code', 'Year Found', 'Employees (Global Ultimate Total)']
X = data[selected_features]
y = data['Sales (Global Ultimate Total USD)']
X = X.dropna()
y = y.dropna()
print(X.shape)
print(y.shape)
y = y[:X.shape[0]]
print(y.shape)


We first use RFC to classify the data.

In [None]:
# train test split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor()

model.fit(X_train, y_train)


predictions = model.predict(X_test)

# model evaluation
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R-squared: {r2}')

# visualise results
plt.scatter(y_test, predictions)
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales')
plt.show()

We also use KNN to classify the data.

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

predictions = knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

We also use Logistic Regression to analyse the data.

In [None]:
model = LogisticRegression()

model.fit(X_train, y_train)

predictions = model.predict(X_test)

# Evaluate the Model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

In [None]:
# Visualise the logistic regression
report_text = f'Classification Report:\n{classification_rep}'

# Parse the classification report text
report_lines = report_text.split('\n')[2:-2]
classes = []
precision = []
recall = []
f1_score = []
support = []

for line in report_lines:
    values = line.split()
    if len(values) == 5:
        classes.append(values[0])
        precision.append(float(values[1]))
        recall.append(float(values[2]))
        f1_score.append(float(values[3]))
        support.append(int(values[4]))

# Create a DataFrame
report_dict = {
    'Class': classes,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1_score,
    'Support': support
}
report_df = pd.DataFrame(report_dict)

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(x='Class', y='Precision', data=report_df, color='blue', label='Precision')
sns.barplot(x='Class', y='Recall', data=report_df, color='green', label='Recall')
sns.barplot(x='Class', y='F1-Score', data=report_df, color='orange', label='F1-score')

plt.title('Precision, Recall, and F1-Score for Each Class')
plt.xlabel('Class')
plt.ylabel('Score')
plt.legend()
plt.xticks(rotation=90)
plt.show()


## The cell below is **NOT** to be removed
##### The function is to be amended so that it accepts the given input (dataframe) and returns the required output (list). 
##### It is recommended to test the function out prior to submission
-------------------------------------------------------------------------------------------------------------------------------
##### The hidden_data parsed into the function below will have the same layout columns wise as the dataset *SENT* to you
##### Thus, ensure that steps taken to modify the initial dataset to fit into the model are also carried out in the function below

In [11]:
def testing_hidden_data(hidden_data: pd.DataFrame) -> list:
    '''DO NOT REMOVE THIS FUNCTION.

The function accepts a dataframe as input and return an iterable (list)
of binary classes as output.

The function should be coded to test on hidden data
and should include any preprocessing functions needed for your model to perform. 
    
All relevant code MUST be included in this function.'''

    result = [] 
    return result

##### Cell to check testing_hidden_data function

In [None]:
# This cell should output a list of predictions.
test_df = pd.read_csv(filepath)
test_df = test_df.drop(columns=['Sales (Domestic Ultimate Total USD)'])
print(testing_hidden_data(test_df))

### Please have the filename renamed and ensure that it can be run with the requirements above being met. All the best!