In [None]:
# Install the Hugging Face transformers release (pinned) that provides CLIPModel / CLIPProcessor.
!pip install transformers==4.27.0
# Install the MLflow client used for experiment tracking.
# NOTE(review): mlflow is not version-pinned, so the resolved version can differ between runs.
!pip install mlflow

In [None]:
# Default device string; a later cell rebinds `device` to a torch.device with a CPU fallback.
device = "cuda:0"

In [None]:
import torch
from transformers import CLIPModel, CLIPProcessor


class CLIPFeatureExtractor:
    """Thin wrapper around a pretrained CLIP checkpoint that returns L2-normalised embeddings.

    The model and its processor are loaded once in ``__init__`` and the model is
    moved to CUDA when available, otherwise it stays on the CPU.
    """

    def __init__(self):
        model_name = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    @torch.no_grad()
    def get_text_features(self, text):
        """Embed one string or a list of strings.

        Args:
            text: a single string or a list of strings.

        Returns:
            A nested Python list (one unit-norm embedding per input text).
        """
        # padding/truncation let texts of different lengths be stacked into one
        # "pt" tensor; for a single string no padding is added.
        inputs = self.processor(
            text=text,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        inputs = inputs.to(self.device)
        text_features = self.model.get_text_features(**inputs)
        # Normalise each embedding to unit L2 length along the feature axis.
        text_features /= text_features.norm(dim=-1, keepdim=True)
        return text_features.tolist()

    @torch.no_grad()
    def get_image_features(self, images):
        """Embed one image or a batch of images.

        Args:
            images: whatever the CLIP processor accepts as ``images``.

        Returns:
            A NumPy array of unit-norm embeddings, one row per image.

        NOTE(review): if callers pass float tensors that are already scaled to
        [0, 1], the processor's own rescaling step may be applied on top of that
        -- confirm against the image-processor configuration.
        """
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = inputs.to(self.device)
        image_features = self.model.get_image_features(**inputs)
        # Normalise each embedding to unit L2 length along the feature axis.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        return image_features.detach().cpu().numpy()

from PIL import Image
import requests

# Build the shared CLIP wrapper once: its constructor loads the pretrained model and
# processor via from_pretrained and moves the model to the selected device.
# This module-level instance is what get_features() calls to embed image batches.
processor = CLIPFeatureExtractor()

In [None]:
from sklearn.exceptions import UndefinedMetricWarning

import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap the module attribute so code that calls ``warnings.warn(...)`` hits the no-op above.
warnings.warn = warn
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from PIL import Image
import os
import pandas as pd
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Augmenting pipeline: resize slightly larger than the target, then randomly
# rotate / crop / flip / erase so the output is a 224x224 image.
transform_augment_train = transforms.Compose([
    transforms.Resize((230, 230)),
    # Apply a rotation of up to 30 degrees to 60% of the samples.
    transforms.RandomApply([transforms.RandomRotation(30)], p=0.6),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    # Blank out a random rectangle in 30% of the samples.
    transforms.RandomErasing(p=0.3, scale=(0.02, 0.33), ratio=(0.3, 3.3)),
])

# Deterministic pipeline: only resize to 224x224.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
])



In [None]:
# Class names; the list position is the integer class id.
id2class = [
    "land slide",
    "drought",
    "urban fire",
    "infrastructure",
    "flooding",
    "earthquake",
    "wild fire",
]

# Dataset root shared by every class folder.
_CDD_ROOT = "/kaggle/input/disaster-images-dataset/Comprehensive Disaster Dataset(CDD)"

# Folder for each class, in the same order as id2class.
path2folder = [
    _CDD_ROOT + "/" + sub_folder
    for sub_folder in (
        "Land_Disaster/Land_Slide",
        "Land_Disaster/Drought",
        "Fire_Disaster/Urban_Fire",
        "Damaged_Infrastructure/Infrastructure",
        "Water_Disaster",
        "Damaged_Infrastructure/Earthquake",
        "Fire_Disaster/Wild_Fire",
    )
]

In [None]:
from mlflow import MlflowClient

import os 
# Expose the JSON key file through the environment variable read by Google client libraries.
# NOTE(review): presumably needed for the MLflow artifact store -- confirm; the path is hard-coded.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "/kaggle/input/datasci-key-storage/krian-mai-krian-proj-386109-169baed358e1.json"
)

# The same server address is used as both tracking URI and registry URI.
_mlflow_server = "http://34.142.181.201:5000"
client = MlflowClient(tracking_uri=_mlflow_server, registry_uri=_mlflow_server)
print("Running mlflow_tracking.py")

# Experiment under which runs are created later in the notebook.
experiment_id = "1"


In [None]:
from glob import glob

# Parallel lists: image_path[k] is a file path and labels[k] is its integer class id.
image_path = []
labels = []
for class_id in range(len(id2class)):
    # Every direct child of the class folder is treated as one sample of that class.
    matches = glob(path2folder[class_id] + "/*")
    image_path.extend(matches)
    labels.extend([class_id] * len(matches))

In [None]:
import random 
from tqdm import tqdm
# Seed Python's built-in RNG.
random.seed(0)

# One row per image: its path and integer class id.
_columns = {"filename": image_path, "class": labels}
df = pd.DataFrame(_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Plain Python lists of paths and class ids for the split.
X = df["filename"].tolist()
y = df["class"].tolist()

# Hold out 10% for validation, stratified by class, with a fixed seed.
_split = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)
train_images_path, val_images_path, train_labels, val_labels = _split

In [None]:
# Positional sample indices that FondueDataset skips when building its item list.
# NOTE(review): presumably these are samples that failed to load -- confirm; the
# indices depend on the ordering of the path lists passed to the dataset.
err_idx = [
    131, 196, 370, 872, 1088, 1260, 1830, 2830, 2842, 3210,
    3415, 4541, 4608, 4770, 4786, 4797, 4991, 5065, 5118, 5144,
    5245, 5446, 5496, 5651, 5933, 5938, 5988, 6254, 6330, 6369,
    6488, 6606, 6742, 7062, 7750, 7878, 7973, 7978, 7979, 7988,
    8187, 8196, 8211, 8215, 8266, 8270, 8454, 8459, 8476, 8605,
]

In [None]:
import numpy as np
import pandas as pd
# Oversample minority classes (sampling with replacement) so each class reaches
# `expected_number` training samples. The extra samples are appended in place
# to train_images_path / train_labels.
np.random.seed(0)
print(len(train_images_path),len(train_labels))
expected_number = 1500

df = pd.DataFrame({"path":train_images_path,"class":train_labels})

# Number of additional samples each class needs (<= 0 means already large enough).
class_number_add = dict(expected_number-df["class"].value_counts())
df =df.groupby("class",as_index=False).agg(list)
# Look paths up by class id explicitly: with as_index=False the grouped frame is
# indexed by row position, which only equals the class id when every id is present.
paths_by_class = dict(zip(df["class"], df["path"]))
for i in class_number_add:
    missing = int(class_number_add[i])
    if missing>0:
        train_images_path+=list(np.random.choice(paths_by_class[i],missing))
        train_labels+=[i]*missing
print(len(train_images_path),len(train_labels))
print(pd.Series(train_labels).value_counts())

from tqdm import tqdm
from torchvision.io import read_image , ImageReadMode

class FondueDataset(Dataset):
    """Image dataset that reads files lazily and returns ``(image, label)`` pairs.

    Samples whose position in the input lists appears in the module-level
    ``err_idx`` are dropped at construction time.
    NOTE(review): the same ``err_idx`` positions are applied to every instance
    (training and validation alike) -- confirm that this is intended.

    Args:
        images_path: iterable of image file paths.
        labels: iterable of labels, parallel to ``images_path``.
        transforms: callable applied to each image when no augmenting
            transform is given; ``None`` leaves the image unchanged.
        augment_transform: optional callable that, when provided, is used
            instead of ``transforms``.
    """

    def __init__(self, 
                 images_path,
                 labels, 
                 transforms=None,
                 augment_transform=None):
        
        super().__init__()
        self.transforms = transforms
        self.augment_transforms = augment_transform
        # Convert once to a set so each membership check is O(1).
        skipped = set(err_idx)
        self.input_dataset = [
            [path, label]
            for position, (path, label) in enumerate(tqdm(zip(images_path, labels)))
            if position not in skipped
        ]

    def __len__(self):
        """Number of retained samples."""
        return len(self.input_dataset)

    def __getitem__(self, idx): 
        """Load sample ``idx`` and return ``(image clamped to [0, 1], label)``."""
        path, label = self.input_dataset[idx]
        # Decode as RGB and scale the pixel values by 1/255.
        img = read_image(path, ImageReadMode.RGB) / 255
        # The augmenting pipeline takes precedence when one was supplied.
        pipeline = self.augment_transforms if self.augment_transforms else self.transforms
        # With no pipeline at all, pass the image through unchanged instead of failing.
        x = pipeline(img) if pipeline is not None else img
        return torch.clamp(x, min=0.0, max=1.0), label

# Both datasets bind their pipeline to the `transforms` parameter; `augment_transform`
# keeps its default of None, so the training set is augmented through `transforms`.
trainset = FondueDataset(
    images_path=train_images_path,
    labels=train_labels,
    transforms=transform_augment_train,
)
valset = FondueDataset(
    images_path=val_images_path,
    labels=val_labels,
    transforms=transform,
)

In [None]:
import os
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from tqdm import tqdm

from PIL import ImageFile
# Tell PIL to load image files that are truncated instead of raising on them.
# NOTE(review): FondueDataset reads images with torchvision.io.read_image rather
# than PIL, so this flag may have no effect on the pipeline here -- confirm.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def get_features(dataset, batch_size=256):
    """Embed every sample of ``dataset`` with the module-level CLIP ``processor``.

    Args:
        dataset: a dataset yielding ``(image, label)`` pairs that the default
            DataLoader collation can batch.
        batch_size: number of samples embedded per forward pass (default 256).

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays, concatenated over all
        batches in dataset order.
    """
    all_features = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=batch_size)):
            features = processor.get_image_features(images)
            # Store as float32 tensors so all batches can be concatenated with torch.cat.
            all_features.append(torch.as_tensor(features, dtype=torch.float32))
            all_labels.append(labels)
    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
# Each call iterates its dataset once and returns (features, labels) as NumPy arrays.
# NOTE(review): `train_labels` is rebound here from the Python list built earlier to the
# array returned by get_features, so re-running earlier cells afterwards sees a different type.
train_features, train_labels = get_features(trainset)
test_features, test_labels = get_features(valset)


In [None]:
# Persist the extracted features and labels so they can be reloaded later.
for _target, _array in (
    ("train_feat.pt", train_features),
    ("test_feat.pt", test_features),
    ("train_labels.pt", train_labels),
    ("test_labels.pt", test_labels),
):
    torch.save(_array, _target)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the logistic-regression `C` hyper-parameter.
param_grid = {"C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}

classifier = LogisticRegression(max_iter=1000, random_state=0)

# 5-fold cross-validated search over the grid.
# NOTE(review): the training set was oversampled with replacement earlier, so copies of
# one image may land in both the fitting and validation folds -- confirm this is acceptable.
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5)
grid_search.fit(train_features, train_labels)

print("Best Parameters: ", grid_search.best_params_)

In [None]:
from sklearn.metrics import classification_report

# Fit the final model with the C chosen by the grid search, so the trained model
# matches the hyper-parameter that is logged for this run.
best_c = grid_search.best_params_["C"]
classifier = LogisticRegression(random_state=0, C=best_c, max_iter=1000)

classifier.fit(train_features, train_labels)
val_predictions = classifier.predict(test_features)
# Macro-averaged F1 over the validation predictions.
f1=classification_report(test_labels,val_predictions,output_dict=True)["macro avg"]["f1-score"]


In [None]:
import pickle

# Create a run, record the hyper-parameter and metric, upload the pickled model,
# and always close the run with an explicit final status.
run = client.create_run(experiment_id,run_name="DisasterClassifierLR")
print(run.info.run_id)

local_artifacts_path = "/kaggle/working/model"
remote_artifacts_path = "model"
filename = '/kaggle/working/model/model.sav'

try:
    # Log the C the saved classifier was actually fitted with.
    client.log_param(run.info.run_id, key="C", value=classifier.C)
    client.log_metric(run.info.run_id, "macro-f1", f1)

    # Create the same absolute directory the model file is written to.
    os.makedirs(local_artifacts_path, exist_ok=True)

    # Close the file before uploading so the artifact is fully flushed to disk.
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    client.log_artifacts(run.info.run_id, local_artifacts_path, artifact_path=remote_artifacts_path)
except Exception:
    # Do not leave the run open when logging fails part-way.
    client.set_terminated(run.info.run_id, status="FAILED")
    raise
else:
    client.set_terminated(run.info.run_id, status="FINISHED")