<a href="https://colab.research.google.com/github/imazerty/1project/blob/master/make_folds_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading https://files.pythonhosted.org/packages/9d/79/9ba64c8c07b07b8b45d80725b2ebd7b7884701c1da34f70d4749f7b45f9a/iterative_stratification-0.1.6-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import copy
import numpy as np
import argparse
from sklearn.preprocessing import MultiLabelBinarizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import os
import pickle
from pathlib import Path
CACHE_DIR = Path('/content/cache/data/')
CACHE_DIR.mkdir(parents=True ,exist_ok=True)

def load_drop_unused_excel():
  df = pd.read_excel("/content/df_j_i.xlsx")
  df1 = df.loc[df.UNUSED != 'x']
  print("{} samples were dropped ...".format(df.LABELS.size - df1.LABELS.size))
  print("{} samples were retained ...".format(df1.LABELS.size))
  return df1

def expand_labels():
  """
  ONE HOT ENCODE
  """
  print("expanding labels ...")
  #load train csv
  df = pd.read_excel("/content/df_j_i.xlsx")
  #drop unused rows
  df = df[df.UNUSED != 'x'].iloc[:,:3]
  print(df.isnull().any())
  #deleting consecutive spaces
  df.LABELS = df.LABELS.str.strip()
  print(df.LABELS.isnull().index)
  #defune all possible labels
  tags = pd.read_excel('/content/multi_labels (1) (1).xlsx')
  unique_tags = tags.Name.unique()
  #one_hot_encode
  mlb = MultiLabelBinarizer(classes=unique_tags)
  labels = pd.DataFrame(mlb.fit_transform(df['LABELS']),columns=mlb.classes_)
  labels.insert(0, 'ID', df.ID.values)
  labels.insert(1, 'URL', df.URL.values)
  return labels



def make_folds(n_folds: int, min_occurrence = 30) -> pd.DataFrame:
  df = expand_labels()
  skf = MultilabelStratifiedKFold(n_splits=n_folds, random_state=42,shuffle=True)
  print("Creating folds ...")
  #labels occurring > min_occurrence
  labels_to_use = (np.sum(df.iloc[:,2:].values, axis = 0)> min_occurrence)
  #rows with no such labels
  empty_rows = (np.sum(df.iloc[:,2:].values[:,labels_to_use], axis=1) == 0)
  print('empty rows: ', sum(empty_rows))
  #print("Eligible labels: {} \ {} ".format(df[labels_to_use].columns, sum(labels_to_use)))
  #keep the relevant rows
  df = df[~empty_rows]
  #initialize folds
  folds = np.array([-1] * len(df))
  #generating number of fold for each row
  for fold, (_,valid_idx) in enumerate(skf.split(df.ID, df.iloc[:, 2:].values[:, labels_to_use])):
    folds[valid_idx] = fold
  #creating new column in df 
  df['fold'] = folds
  return df

def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--n_folds', type=int, default=5)
  args = parser.parse_args(['--n_folds', '5'])
  df = make_folds(n_folds=args.n_folds)
  df.to_pickle(os.path.join(CACHE_DIR,'folds.pkl'))

if __name__ == '__main__':
  main()

expanding labels ...
ID        False
URL        True
LABELS     True
UNUSED     True
dtype: bool
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6328, 6329, 6330, 6331, 6334, 6344, 6345, 6349, 6350, 6351],
           dtype='int64', length=5735)
Creating folds ...
empty rows:  2


  .format(sorted(unknown, key=str)))


In [0]:
folds = pd.read_pickle(CACHE_DIR / 'folds.pkl')

folds.head(3)

Unnamed: 0,ID,URL,A,B,C,D,E,F,G,H,H1,H2,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,fold
0,media_juldebar_Deep_Mapping_4To_data_deep_mapp...,http://162.38.140.205/tmp/Deep_mapping/session...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,2
1,media_juldebar_Deep_Mapping_4To_data_deep_mapp...,http://162.38.140.205/tmp/Deep_mapping/session...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,media_juldebar_Deep_Mapping_4To_data_deep_mapp...,http://162.38.140.205/tmp/Deep_mapping/session...,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2


In [0]:
  df = pd.read_excel("/content/df_j_i.xlsx")
  
  #drop unused rows
  df = df[df.UNUSED != 'x'].iloc[:,:3]
  print(df.isnull().any())
  #deleting consecutive spaces
  df.LABELS = df.LABELS.str.strip()
  print(df[df.LABELS.isnull().values])

ID        False
URL        True
LABELS    False
dtype: bool
Empty DataFrame
Columns: [ID, URL, LABELS]
Index: []
