In [986]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [987]:
# Load datasets
features = pd.read_csv('train.csv')
labels = pd.read_csv('trainLabels.csv')

In [988]:
test_df = pd.read_csv('test.csv')
test_df.columns = features.columns

In [989]:
# Drop id column
features = features.drop(columns=['id'])
labels = labels.drop(columns=['id'])

In [990]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Columns: 145 entries, x1 to x145
dtypes: float64(55), int64(30), object(60)
memory usage: 11.1+ MB


In [991]:
print(features.isnull())

         x1     x2     x3     x4     x5     x6     x7     x8     x9    x10  \
0     False  False  False  False  False  False  False  False  False  False   
1      True   True   True   True  False  False  False  False  False   True   
2     False  False  False  False  False  False  False  False  False  False   
3     False  False  False  False  False  False  False  False  False  False   
4     False  False  False  False  False  False  False  False  False  False   
...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
9994  False  False  False  False  False  False  False  False  False  False   
9995  False  False  False  False  False  False  False  False  False  False   
9996  False  False  False  False  False  False  False  False  False  False   
9997  False  False  False  False  False  False  False  False  False  False   
9998  False  False  False  False  False  False  False  False  False  False   

      ...   x136   x137   x138   x139   x140   x141   x142   x1

In [992]:
# Identify rows with NaN values in train.csv
rows_with_nan = features[features.isnull().any(axis=1)].index

# Drop corresponding rows from both train.csv and trainLabels.csv
features = features.drop(rows_with_nan)
labels = labels.drop(rows_with_nan)

# Reset index after dropping rows
features.reset_index(drop=True, inplace=True)
labels.reset_index(drop=True, inplace=True)

# Verify shapes to ensure alignment
print("Shape of train data after dropping NaN rows:", features.shape)
print("Shape of labels data after dropping NaN rows:", labels.shape)


Shape of train data after dropping NaN rows: (7402, 145)
Shape of labels data after dropping NaN rows: (47402, 33)


In [993]:
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47402 entries, 0 to 47401
Data columns (total 33 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   y1      47402 non-null  int64
 1   y2      47402 non-null  int64
 2   y3      47402 non-null  int64
 3   y4      47402 non-null  int64
 4   y5      47402 non-null  int64
 5   y6      47402 non-null  int64
 6   y7      47402 non-null  int64
 7   y8      47402 non-null  int64
 8   y9      47402 non-null  int64
 9   y10     47402 non-null  int64
 10  y11     47402 non-null  int64
 11  y12     47402 non-null  int64
 12  y13     47402 non-null  int64
 13  y14     47402 non-null  int64
 14  y15     47402 non-null  int64
 15  y16     47402 non-null  int64
 16  y17     47402 non-null  int64
 17  y18     47402 non-null  int64
 18  y19     47402 non-null  int64
 19  y20     47402 non-null  int64
 20  y21     47402 non-null  int64
 21  y22     47402 non-null  int64
 22  y23     47402 non-null  int64
 23  y24     474

In [994]:
labels = labels.iloc[:features.shape[0]]

In [995]:
class HashingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.bow_transformer = CountVectorizer(tokenizer=lambda x: x.split(), preprocessor=lambda x: x)
        self.tfidf_transformer = TfidfVectorizer(tokenizer=lambda x: x.split(), preprocessor=lambda x: x)

    def fit(self, X, y=None):
        hash_bow_tokens = X.apply(lambda row: ' '.join(row), axis=1)
        hash_tfidf_tokens = X.apply(lambda row: ' '.join(row), axis=1)
        self.bow_transformer.fit(hash_bow_tokens)
        self.tfidf_transformer.fit(hash_tfidf_tokens)
        return self
        
    def transform(self, X):
        hash_bow_tokens = X.apply(lambda row: ' '.join(row), axis=1)
        hash_tfidf_tokens = X.apply(lambda row: ' '.join(row), axis=1)
        bow_matrix = self.bow_transformer.transform(hash_bow_tokens)
        tfidf_matrix = self.tfidf_transformer.transform(hash_tfidf_tokens)
        return np.hstack((bow_matrix.toarray(), tfidf_matrix.toarray()))

def crepre(numerical_cols, categorical_cols, hash_cols):
    # Pipeline for numerical features: imputation and scaling
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    # Pipeline for categorical features: imputation and encoding
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    # Custom transformer for hashing columns
    hash_pipeline = Pipeline([
        ('hash', HashingTransformer())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_cols),
            ('cat', categorical_pipeline, categorical_cols),
            ('hash', hash_pipeline, hash_cols)
        ]
    )
    
    return preprocessor

In [996]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
X_train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
210,NO,NO,NosfhgFjAjWdTRV/wjiMuvKQb+09duB06CRuVptbuz0=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.413677,0.000000,1.000000,0.000000,0.226011,NO,...,-1.000000,1.000,1261,892,YES,NO,YES,1,0.170404,0.214909
2487,NO,NO,lsLkdilrSTL8RRVEk9g0jtW1gbqOdFpSJi0mcE5OfNI=,3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=,1.415919,0.000000,1.000000,0.000000,0.114014,NO,...,4.000000,1.000,1263,892,NO,NO,NO,1,0.881166,0.100554
6308,NO,NO,hsFLVug96pqc0BwM/lwhDhj02OAsIoXuif1dp+XJxq0=,XSJ6E8aAoZC7/KAu3eETpfMg3mCq7HVBFIVIsoMKh9E=,0.279927,0.121535,0.967103,0.119708,0.137842,YES,...,-1.000000,0.890,4672,3283,YES,NO,YES,1,0.841913,0.130779
110,NO,NO,qO7umaUxgyEq+R2IEyK3YWPC2JVOTwwdx+RSXE6850k=,7GjnXIaqLV/fxjKUHRk+et7LlMVoaP1iUHGLt0gUz5w=,0.624386,0.035234,0.982063,0.056161,0.211672,NO,...,0.000000,0.960,3307,4683,YES,NO,YES,0,0.103993,0.200181
51,NO,NO,6sYBOBasLUysRxm3AIlWhlQjyWkf5uYSVDmIANt7ABA=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,1.413677,0.000000,1.000000,0.000000,0.425852,NO,...,0.000000,1.000,1261,892,YES,NO,YES,6,0.674888,0.416336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,YES,YES,D3AaHEwVestK60YV8N50H314kC708AEWX9ifjjmMgDk=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,1.062519,0.050740,0.942616,0.451827,0.660317,YES,...,0.000000,0.920,4672,3311,NO,NO,NO,7,0.704621,0.649615
5226,NO,NO,wTh+7YmMwS6rbbe2JpgOL0cmhuP8zZrpLJh1YwpDdB0=,2zAcjChkLnuzoJYiDXJXndDH0wJtnqxElY2LpjWZRrU=,1.135814,0.092559,0.918330,0.236540,0.283604,NO,...,3.333333,0.870,4672,3306,YES,NO,YES,2,0.472474,0.272046
5390,NO,NO,dqfmyLacCX5hLmhxum5iNXXj+k7WubTyMXBrmucmhCE=,565Vsa15yrjN4LS9MGuMGKd33aB6m7CNE7L5WvV7s+8=,1.316601,0.071364,0.943151,0.498034,0.492834,NO,...,0.000000,0.905,4675,3307,NO,NO,NO,5,0.428485,0.481711
860,NO,NO,fY06/VaeouwrJ6DegRsQsWf3P+ISSk5roLsGU+LURfo=,BWlzsfzvLpUVVqvMBbjZ4zlrnQb/agQ7zCXv27i3RUw=,1.413677,0.000000,1.000000,0.000000,0.694687,YES,...,0.000000,1.000,1261,892,YES,NO,YES,13,0.737668,0.685964


In [997]:
# Create a full pipeline with preprocessing and model
# Identify categorical, numerical, and hash columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
hash_cols = [col for col in categorical_cols if X_train[col].str.len().max() > 20] # assuming hash columns have long strings
categorical_cols = [col for col in categorical_cols if col not in hash_cols]


In [998]:
preprocessor = crepre(numerical_cols,categorical_cols,hash_cols)
preprocessor.fit(X_train)
X_train_transformed = preprocessor.transform(X_train)
model = Pipeline(steps=[
    ('classifier', MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42)))
])



In [999]:
model.fit(X_train_transformed, y_train)

In [1000]:
X_test_transformed = preprocessor.transform(X_test)
predictions = model.predict(X_test_transformed)

In [1001]:
predictions.shape

(1481, 33)

In [1002]:
y_test_flat = np.array(y_test).ravel()
y_test_pred_flat = predictions.ravel()

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test_flat, y_test_pred_flat)
accuracy

0.988439424631187

In [1005]:
test = test_df
test_df = test_df.drop(columns=['id'])


In [1006]:
rows_with_nan = test_df[test_df.isnull().any(axis=1)].index

#Drop corresponding rows from both train.csv and trainLabels.csv
test_df = test_df.drop(rows_with_nan)

#Reset index after dropping rows
test_df.reset_index(drop=True, inplace=True)


In [1007]:
rows_with_nan = test[test.isnull().any(axis=1)].index

# Step 2: Drop corresponding rows from both train.csv and trainLabels.csv
test = test.drop(rows_with_nan)

# Optional: Reset index after dropping rows
test.reset_index(drop=True, inplace=True)


In [1008]:
test

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,1698002,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,...,1.0,0.866667,4672,3311,NO,NO,NO,5,0.945032,0.471318
1,1698003,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,...,-1.0,1.000000,1263,892,NO,NO,NO,8,0.557175,0.693587
2,1698005,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,...,0.0,0.870000,4400,3413,YES,NO,YES,2,0.224729,0.870909
3,1698006,NO,NO,kM4KU87XvnvKRvf4dN3Tu4zQYq8fpcqhDTFADWdfCg8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.415919,0.000000,1.000000,0.000000,0.232779,...,0.0,1.000000,1263,892,YES,NO,YES,6,0.536996,0.223278
4,1698007,NO,NO,NMFPnlbm6YWoxpG5KdcfdDMWWSZ7FYAUlJoLtkBBf9k=,3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=,1.414798,0.000000,1.000000,0.000000,0.323296,...,-1.0,1.000000,1262,892,YES,YES,YES,2,0.863229,0.323296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1460,1699994,NO,NO,X/hdUOVR5KuExVGLzjhLcM2CyIqym9t0Nh+ZX05M+1w=,+yhSY//Hpg7u0bSA7NYmcmRFgv3bF4Tw3BMHrBqaTtA=,1.415919,0.000000,1.000000,0.000000,0.297704,...,0.0,1.000000,1263,892,NO,NO,NO,11,0.809417,0.288203
1461,1699996,NO,NO,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,...,0.0,1.000000,1188,918,YES,NO,YES,2,0.198257,0.155724
1462,1699997,NO,NO,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,...,0.0,0.720000,4400,3413,YES,NO,YES,0,0.582479,0.778864
1463,1699998,NO,NO,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,...,0.0,0.845000,4400,3400,NO,NO,NO,5,0.930588,0.201591


In [1014]:
id = test['id']

In [1009]:
Test_transformed = preprocessor.transform(test_df)
pred_test = model.predict(Test_transformed)

In [1010]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Columns: 145 entries, x1 to x145
dtypes: float64(55), int64(30), object(60)
memory usage: 1.6+ MB


In [1011]:
pred_test.shape

(1465, 33)

In [1022]:
df = pd.DataFrame(pred_test)

In [1013]:
df.columns = labels.columns
df

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,...,y24,y25,y26,y27,y28,y29,y30,y31,y32,y33
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [1016]:
df.insert(0,'id',id)

In [1018]:
formatted_rows = []
for index, row in df.iterrows():
    id_value = row['id']
    for i in range(1, 34):  # Assuming columns y1 to y33
        column_name = f'y{i}'
        value = row[column_name]
        formatted_rows.append(f'{id_value}_{column_name},{value}')

In [1020]:
formatted_rows.insert(0, 'id_label,pred')

In [1021]:
output_file = 'assign_submission.csv'
with open(output_file, 'w') as f:
    f.write('\n'.join(formatted_rows))