# **MultiLabel**

### **Loading Libraries**

In [1]:
# Operating Systems
import os
import shutil

# Numerical Computing
import numpy as np

# Data Manipuation
import pandas as pd

# SciPy
import scipy
from scipy import stats

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# BigQuery
from google.cloud import bigquery
from google.colab import auth

# Scikit-Learn
from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow_hub import KerasLayer
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, layers, models, utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D

In [2]:
# User Authentication
auth.authenticate_user()

# BigQuery Library
# !pip install --upgrade google-cloud-bigquery

In [3]:
project_id = 'core-catalyst-425922-v9'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

# BigQuery Client Config
client = bigquery.Client(project=project_id)

### ***Building a Multilabel Model with Simgoid Output***

In [4]:
!gsutil cp 'gs://ml-design-patterns/so_data.csv' .

Copying gs://ml-design-patterns/so_data.csv...
/ [0 files][    0.0 B/276.7 MiB]                                                ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

- [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [5]:
data = pd.read_csv('so_data.csv', names=['tags', 'original_tags', 'text'], header=0)
data = data.drop(columns=['original_tags'])
data = data.dropna()

data = shuffle(data, random_state=22)
data.head()

Unnamed: 0,tags,text
182914,"tensorflow,keras",avocado image captioning model not compiling b...
48361,pandas,return excel file from avocado with flask in f...
181447,"tensorflow,keras",validating with generator (avocado) i'm trying...
66307,pandas,avocado multiindex dataframe selecting data gi...
11283,pandas,get rightmost non-zero value position for each...


In [6]:
tags_split = [tags.split(',') for tags in data['tags'].values]

print(tags_split[0])

['tensorflow', 'keras']


In [7]:
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)
num_tags = len(tags_encoded[0])

print(data['text'].values[0][:110])
print(tag_encoder.classes_)
print(tags_encoded[0])

avocado image captioning model not compiling because of concatenate layer when mask_zero=true in a previous la
['keras' 'matplotlib' 'pandas' 'scikitlearn' 'tensorflow']
[1 0 0 0 1]


In [8]:
train_size = int(len(data) * .8)

print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 150559
Test size: 37640


In [9]:
train_tags = tags_encoded[:train_size]

test_tags = tags_encoded[train_size:]

In [10]:
train_qs = data['text'].values[:train_size]

test_qs = data['text'].values[train_size:]

In [11]:
VOCAB_SIZE=400

tokenizer = text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_qs)

body_train = tokenizer.texts_to_matrix(train_qs)
body_test = tokenizer.texts_to_matrix(test_qs)

In [12]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(50, input_shape=(VOCAB_SIZE,), activation='relu'))
model.add(tf.keras.layers.Dense(25, activation='relu'))
model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                20050     
                                                                 
 dense_1 (Dense)             (None, 25)                1275      
                                                                 
 dense_2 (Dense)             (None, 5)                 130       
                                                                 
Total params: 21455 (83.81 KB)
Trainable params: 21455 (83.81 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
model.fit(body_train, train_tags, epochs=3, batch_size=128, validation_split=0.1)

print('Eval loss/accuracy:{}'.format(
  model.evaluate(body_test, test_tags, batch_size=128)))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Eval loss/accuracy:[0.10232413560152054, 0.8952444195747375]


### **Parsing Sigmoid Results**

In [15]:
predictions = model.predict(body_test[:3])



In [16]:
classes = tag_encoder.classes_

for q_idx, probabilities in enumerate(predictions):
  print(test_qs[q_idx])
  for idx, tag_prob in enumerate(probabilities):
    if tag_prob > 0.7:
      print(classes[idx], round(tag_prob * 100, 2), '%')
  print('')

i want to subtract each column from the previous non-null column using the diff function i have a long list of columns and i want to subtract the previous column from the current column and replace the current column with the difference.  so if i have:  a   b   c   d 1  nan  3   7 3  nan  8   10 2  nan  6   11   i want the output to be:  a   b   c   d  1  nan  2   4 3  nan  5   2 2  nan  4   5   i have been trying to use this code:  df2 = df1.diff(axis=1) but this does not produce the desired output  thanks in advance.
pandas 99.75 %

how to merge all csv files in a folder to single csv ased on columns? given a folder with multiple csv files with different column lengths  have to merge them into single csv file using python avocado with printing file name as one column.  input: https://www.dropbox.com/sh/1mbgjtrr6t069w1/aadc3zrrzf33qbil63m1mxz_a?dl=0  output:   id  snack      price    sheetname 5   orange      55     sheet1 7   apple       53     sheet1 8   muskmelon   33     sheet1 11

### **Sigmoid Output Binary Classification**

In [17]:
!gsutil cp gs://ml-design-patterns/mushrooms.csv .

Copying gs://ml-design-patterns/mushrooms.csv...
/ [1 files][365.2 KiB/365.2 KiB]                                                
Operation completed over 1 objects/365.2 KiB.                                    


In [18]:
mushroom_data = pd.read_csv('mushrooms.csv')
mushroom_data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [19]:
mushroom_data.loc[mushroom_data['class'] == 'p', 'class'] = 0

mushroom_data.loc[mushroom_data['class'] == 'e', 'class'] = 1

In [20]:
labels = mushroom_data.pop('class')

In [21]:
dummy_data = pd.get_dummies(mushroom_data)

In [22]:
# Data Splitting
train_size = int(len(mushroom_data) * .8)

train_data = dummy_data[:train_size]
test_data = dummy_data[train_size:]

train_labels = labels[:train_size]
test_labels = labels[train_size:]

In [23]:
model = keras.Sequential([
    keras.layers.Dense(32, input_shape=(len(dummy_data.iloc[0]),), activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 32)                3776      
                                                                 
 dense_4 (Dense)             (None, 8)                 264       
                                                                 
 dense_5 (Dense)             (None, 1)                 9         
                                                                 
Total params: 4049 (15.82 KB)
Trainable params: 4049 (15.82 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
model.fit(train_data.values.tolist(), train_labels.values.tolist())



<keras.src.callbacks.History at 0x7b2adc8ae470>

In [27]:
model.evaluate(test_data.values.tolist(), test_labels.values.tolist())



[0.1300443410873413, 0.9569230675697327]

In [28]:
def to_one_hot(data):
  if data == 0:
    return [1, 0]
  else:
    return [0,1]

In [29]:
train_labels_one_hot = train_labels.apply(to_one_hot)

test_labels_one_hot = test_labels.apply(to_one_hot)

In [30]:
model_softmax = keras.Sequential([
    keras.layers.Dense(32, input_shape=(len(dummy_data.iloc[0]),), activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

In [31]:
model_softmax.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 32)                3776      
                                                                 
 dense_7 (Dense)             (None, 8)                 264       
                                                                 
 dense_8 (Dense)             (None, 2)                 18        
                                                                 
Total params: 4058 (15.85 KB)
Trainable params: 4058 (15.85 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
model_softmax.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [33]:
model_softmax.fit(train_data.values.tolist(), train_labels_one_hot.values.tolist())



<keras.src.callbacks.History at 0x7b2adc8278b0>

In [34]:
model_softmax.evaluate(test_data.values.tolist(), test_labels_one_hot.values.tolist())



[0.14723895490169525, 0.9366154074668884]