# ***Libraries***

In [5]:
import numpy as np
import pandas as pd
import pyarabic.araby as ar

import re , string

from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample

from huggingface_hub import notebook_login

In [2]:

# Authenticate this notebook session with the Hugging Face Hub; needed for the
# trainer.push_to_hub() call at the end of the notebook.
notebook_login()

In [6]:
# Reused further down as the Trainer output directory (training_args.output_dir).
# NOTE(review): the value contains spaces; if push_to_hub derives the Hub
# repository name from it, the Hub may reject it — confirm before pushing.
repo_name = "Predict Arabic dialects"

## Load and Explore data

In [7]:
# Load the id/dialect table from the Kaggle input mount. The path is absolute,
# so this cell only runs unchanged inside that Kaggle environment.
train = pd.read_csv("/kaggle/input/aim-technologies-predict-the-dialectal-arabic/dialect_dataset.csv")

# Untouched copy of the table as loaded; `train` itself is modified below.
# train_df is not referenced again in the visible cells.
train_df = train.copy()

In [8]:
# Column names, dtypes, non-null counts and memory usage of the loaded table.
train.info()

In [9]:
# The IDs are posted to the text API as JSON strings, so cast the column once
# and keep a plain Python list of every ID to request.
train['id'] = train['id'].astype(str)
ID = list(train['id'])
len(ID)

## Data fetching

In [10]:
import requests

# Endpoint that receives a JSON list of IDs in the POST body.
URL = "https://recruitment.aimtechnologies.co/ai-tasks"

# Both the request body and the expected response are JSON.
headers = dict.fromkeys(("Accept", "Content-Type"), "application/json")


In [11]:
# Request the texts in consecutive batches of 1000 IDs. Only complete batches
# are sent here: the loop stops as soon as the next window would run past the
# end of ID, so a trailing partial batch is left for the following cell.
reslist = []
start, end = 0, 1000

while end <= len(ID):
    lst = list(ID[start:end])
    response = requests.post(URL, json=lst, headers=headers).json()
    reslist.append(response)

    # Slide the window forward by one batch.
    start += 1000
    end += 1000

In [12]:
# Fetch the trailing IDs that did not fill a complete 1000-ID batch in the
# previous cell. The start offset is derived from len(ID) instead of being
# hard-coded, so the cell stays correct if the dataset size changes.
reslist2 = []
tail_start = (len(ID) // 1000) * 1000
lst = list(ID[tail_start:])

# Nothing is left over when len(ID) is an exact multiple of the batch size;
# skip the request rather than posting an empty list.
if lst:
    response1 = requests.post(URL, json=lst, headers=headers).json()
    reslist2.append(response1)

In [13]:
# All API responses in request order: the full batches followed by the tail batch.
text = reslist + reslist2

## Save data as a dataframe

In [14]:
# Flatten the list of responses into one long table: DataFrame(text) yields one
# row per response, .stack() moves the column labels into a second index level,
# and .apply(pd.Series) expands each stacked value into columns (the rename a
# few cells below expects the text to end up in column 0).
# NOTE(review): presumably every response is a dict mapping id -> text — confirm.
df = pd.DataFrame(text).stack().apply(pd.Series)

In [15]:
# Turn the two index levels into ordinary columns; the following cells refer to
# them as 'level_0' and 'level_1'.
df = df.reset_index()

## Rename columns after resetting the index

In [16]:
# Give the value column (0) and the second index level (level_1) the names
# 'text' and 'id', so the frame can be merged with `train` on 'id'.
df = df.rename(columns={0: 'text', 'level_1': 'id'})

In [17]:
# 'level_0' is the first index level left over from reset_index(); it is not
# used again, so drop it.
df = df.drop(['level_0'], axis=1)

## Merge data 

In [18]:
# Attach the fetched text to the rows of `train` by matching on 'id'. pd.merge
# defaults to an inner join, so IDs missing on either side are dropped.
df = pd.merge(train, df, on="id")

In [19]:
# Display the merged frame to check the join result.
df

## Data Preprocessing

In [20]:
# Number of missing values in each column of the merged frame.
df.isnull().sum()

In [21]:
# Number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

In [22]:
# Number of distinct dialect labels, i.e. the number of target classes.
df['dialect'].nunique()

In [23]:
# Rows per dialect label, to see how balanced the classes are.
df['dialect'].value_counts()

In [24]:
#df = pd.get_dummies(data=df, columns=['dialect'])

In [25]:
#df


In [26]:
# Normalise the column names in place: trim surrounding whitespace and lowercase.
df.columns = df.columns.map(lambda name: name.strip().lower())

In [27]:
# Display the frame again to confirm the normalised column names.
df

In [28]:
# Placeholder strings that the next cell replaces with NaN (exact cell matches).
null_values = ['unknown','missing','?','','NULL','NaN']

In [29]:
# Replace every placeholder listed in null_values with a real missing value, in
# all columns. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(null_values, np.nan)

In [30]:
# Missing values per column after the placeholder strings were replaced by NaN.
df.isnull().sum()

## Cleaning data (punctuation, mentions, emojis, etc.)

In [31]:
# Keep only characters from the Arabic Unicode ranges listed in the pattern:
# every run of any other character (Latin letters, digits, punctuation, emoji,
# mentions, URLs ...) collapses into a single space, then the ends are trimmed.
# The pattern is compiled once instead of once per row.
arabic_only = re.compile(
    r'[^\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD]+'
)

# na_action='ignore' leaves NaN cells untouched: the placeholder replacement
# above can introduce NaN into 'text', and re.sub raises TypeError on a float.
df['text'] = df['text'].map(
    lambda text: arabic_only.sub(' ', text).strip(),
    na_action='ignore',
)

In [32]:
# Display the cleaned text column to check the result of the regex clean-up.
df['text']

In [33]:
# Cache the merged and cleaned frame in the working directory so the API fetch
# does not have to be repeated.
# NOTE(review): the row index is written as an extra unnamed column; reading the
# file back will presumably need index_col=0 — confirm.
df.to_csv('data.csv')  

In [34]:
#df = pd.read_csv('data.csv')

In [35]:
#df['id'] = df['id'].astype(int)

In [36]:
# Work on a copy for modelling so `df` keeps the 'id' column dropped below.
df_set = df.copy()

In [None]:
# The identifier carries no signal for the classifier; keep only text and label.
df_set = df_set.drop(columns=['id'])


In [38]:
# Display the modelling frame (the merged data without the 'id' column).
df_set

# Model Training

## Predict using a linear SVM (machine learning)

In [39]:
# Hold out 10% of the rows as the validation set; the fixed random_state keeps
# the split reproducible.
train_set, val_set = train_test_split(df_set, random_state=42, test_size=.10)

# Report the number of rows per dialect in each partition.
train_counts = train_set['dialect'].value_counts()
val_counts = val_set['dialect'].value_counts()
print(
    "Train set: ",
    train_counts,
    "---------------------------",
    "val set: ",
    val_counts,
    sep="\n",
)

In [40]:
# Plain Python lists of texts and labels for the scikit-learn pipeline.
x = list(train_set['text'])
y = list(train_set['dialect'])

# Carve a very small (0.05%) check split out of the training portion.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, random_state=42, test_size=0.0005
)

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over 1- to 8-character n-grams, with sublinear tf
# scaling and L2-normalised rows. The vocabulary is fitted on the training
# texts only and then reused unchanged for the validation texts.
vectorizer = TfidfVectorizer(ngram_range=(1,8),  analyzer='char' ,  max_df=0.999999999, min_df=1, sublinear_tf=True, use_idf=True, norm='l2' )
train_vectors = vectorizer.fit_transform(x_train)
val_vectors = vectorizer.transform(x_val)
# The first label previously read "Shape o# training data"; fixed the typo.
print("Shape of training data : ", train_vectors.shape, "\nShape of validation data : ", val_vectors.shape)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM (the variable keeps its historical "LR" suffix). Balanced class
# weights compensate for the uneven number of rows per dialect.
model_LR = LinearSVC(
    class_weight='balanced',
    max_iter=500,
    random_state=42,
)

model_LR.fit(train_vectors, y_train)

In [None]:
def performance(model, y_true, vectors):
    """Score a fitted classifier on already-vectorised inputs.

    Prints the percentage share of each predicted label, then returns the
    evaluation results.

    Args:
        model: fitted estimator exposing ``predict``.
        y_true: ground-truth labels aligned with ``vectors``.
        vectors: feature matrix passed to ``model.predict``.

    Returns:
        Tuple ``(confusion_matrix, classification_report, accuracy)``; the
        report is computed with three decimal digits.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    print("Predicting the sentiments...")
    y_pred = model.predict(vectors)

    # Side-by-side table of truth and prediction, used for the share printout.
    comparison = pd.DataFrame({'actual': y_true, 'predicted': y_pred})
    print("\nAnalysis after prediction : \n")
    share = comparison['predicted'].value_counts(normalize=True) * 100
    print(share)

    return (
        confusion_matrix(y_true, y_pred),
        classification_report(y_true, y_pred, digits=3),
        accuracy_score(y_true, y_pred),
    )

In [None]:
# Evaluate the SVM on the small check split and print each part of the result.
perform = performance(model_LR, y_val, val_vectors)
conf_matrix, report, accuracy = perform

print("Confusion Matrix :\n", conf_matrix)
print("classification report: \n", report)
print("Accuracy score  = ", accuracy * 100)
print("-" * 100)

In [None]:
# Vectorise the held-out *texts* with the vocabulary fitted on the training
# data. This previously transformed the 'dialect' label column, so the model
# was asked to predict from the label strings rather than from the tweets.
test_vectors = vectorizer.transform(val_set['text'])

In [None]:
# Predict a dialect for every held-out row and store text/prediction pairs.
predicted = model_LR.predict(test_vectors)

print("Results : \n")

# The 'dialect' column holds the predicted label, not the ground truth.
results = pd.DataFrame({'text': val_set['text'], 'dialect': predicted})
results.to_csv("outputs.csv", index=False)

In [None]:
# Read the prediction file back to check what was written.
out = pd.read_csv('outputs.csv')

In [None]:
# Show the first 50 text/prediction pairs.
out.head(50)

## Predict using BERT (deep learning)

In [None]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.  

    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi
    
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
# a class representing the dataset
class Dataset:
    """Plain container bundling a named train/test pair with its label list.

    Note: this name shadows the ``Dataset`` imported from ``torch.utils.data``
    at the top of the notebook for every cell that runs after this one.
    """

    def __init__(self, name, train, test, label_list):
        # Store the four values unchanged; no copying or validation is done.
        self.name, self.train = name, train
        self.test, self.label_list = test, label_list

In [None]:
class BERTModelDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item for sequence classification.

    The base class is named explicitly as ``torch.utils.data.Dataset`` because
    the bare name ``Dataset`` is rebound by the container class defined in the
    previous cell.

    Args:
        text: indexable collection of raw texts.
        target: indexable collection of labels aligned with ``text``.
        model_name: Hugging Face model id used to load the tokenizer.
        max_len: length every encoded sequence is padded/truncated to.
        label_map: mapping from label value to integer class index.
    """

    def __init__(self, text, target, model_name, max_len, label_map):
        # Zero-argument super(): the former super(BERTModelDataset).__init__()
        # only re-initialised a throwaway super object and never reached the base.
        super().__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenise the text at position ``item``.

        Returns:
            InputFeatures holding 1-D ``input_ids`` and ``attention_mask``
            tensors plus the integer label looked up in ``label_map``.

        Raises:
            KeyError: if the target value is missing from ``label_map``.
        """
        # Collapse any run of whitespace into a single space.
        text = " ".join(str(self.text[item]).split())

        encoded_review = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            return_token_type_ids=False,
            # padding='max_length' replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Tensors stay on the CPU: the Trainer moves each batch to its own
        # device, and GPU tensors created here break pinned-memory loading.
        return InputFeatures(
            input_ids=encoded_review['input_ids'].flatten(),
            attention_mask=encoded_review['attention_mask'].flatten(),
            label=self.label_map[self.target[item]],
        )

In [None]:
# prepare for Training

Model_Used = "UBC-NLP/MARBERT"
Task_Name = "classification"

# Length every sequence is padded/truncated to. This was 0, which truncates
# every input to nothing; 128 tokens is a reviewer-chosen cap for tweet-length
# text — tune if needed. Extra_Len is kept for backward compatibility but is
# not used in the visible cells.
Max_Len = 128
Extra_Len = 6

# Labels in order of first appearance in the training split.
label_list = list(train_set['dialect'].unique())

print(label_list)

print(train_set['dialect'].value_counts())

data_set = Dataset( "AIMTech", train_set, val_set, label_list )

# Dialect name -> integer class index expected by the model.
label_map = { v:index for index, v in enumerate(label_list) }

print(label_map)

# The first argument is the input text: it previously passed the 'dialect'
# column, so the model would have been trained on the label strings themselves.
train_dataset = BERTModelDataset(train_set['text'].to_list(),
                                 train_set['dialect'].to_list(),Model_Used,Max_Len,label_map)

evaluation_dataset = BERTModelDataset(val_set['text'].to_list(),
                                      val_set['dialect'].to_list(),Model_Used,Max_Len,label_map)

In [None]:
def model_init():
    """Load a fresh pretrained classifier with one output per dialect label.

    Reads the notebook globals ``Model_Used`` (checkpoint id) and ``label_map``
    (its size sets the number of output classes).
    """
    num_labels = len(label_map)
    return AutoModelForSequenceClassification.from_pretrained(
        Model_Used,
        num_labels=num_labels,
        return_dict=True,
    )

def compute_metrics(p):
    """Compute evaluation metrics for the Trainer.

    Args:
        p: prediction object (expected to be an EvalPrediction) exposing
            ``predictions`` (per-class scores) and ``label_ids``.

    Returns:
        Dict with macro F1, macro F1 restricted to class ids 1 and 2,
        macro precision, macro recall and accuracy. A full classification
        report is printed as a side effect.
    """
    y_true = p.label_ids
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(y_true)
    print(classification_report(y_true, preds))

    def macro(metric, **extra):
        # Macro-average the given sklearn metric over the predictions.
        return metric(y_true, preds, average='macro', **extra)

    # NOTE(review): restricting to labels [1, 2] looks inherited from a
    # positive/negative sentiment setup — confirm it is meaningful here.
    pos_neg_f1 = macro(f1_score, labels=[1, 2])

    return {
        'macro_f1': macro(f1_score),
        'macro_f1_pos_neg': pos_neg_f1,
        'macro_precision': macro(precision_score),
        'macro_recall': macro(recall_score),
        'accuracy': accuracy_score(y_true, preds),
    }

def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed shared by all generators.
    """
    # Imported locally: neither module appears in the notebook's import cell,
    # so the function used to fail with NameError when called.
    import os
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Only affects processes started after this point; the running
    # interpreter has already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Start from the library defaults and then overwrite individual fields on the
# instance; "./train" is only a placeholder, output_dir is reassigned on the
# last line of this cell.
# NOTE(review): fields are assigned after construction, so any validation or
# derived settings TrainingArguments computes in its constructor are presumably
# not re-applied — confirm against the installed transformers version.
training_args = TrainingArguments("./train")
training_args.lr_scheduler_type = 'cosine'
# NOTE(review): looks like a legacy flag; evaluation_strategy below appears to
# be the setting that controls evaluation — confirm.
training_args.evaluate_during_training = True
training_args.adam_epsilon =1e-8 
training_args.learning_rate =  1.215e-05
# Mixed precision; presumably requires a CUDA device — confirm.
training_args.fp16 = True
# 1.215e-05 was the best-scoring learning rate as of 5-6-2021
training_args.per_device_train_batch_size = 16 # previously tried: 64, 69
training_args.per_device_eval_batch_size = 16 # previously tried: 64, 69
# Together with the batch size of 16 this accumulates 2 batches per optimizer step.
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 2
training_args.warmup_steps = 0 
# Evaluate once per epoch.
training_args.evaluation_strategy = EvaluationStrategy.EPOCH
training_args.logging_steps = 200
# Very large save interval, chosen so that no intermediate checkpoint is saved.
training_args.save_steps = 100000
training_args.seed = 42 # previously tried: 84, 123, 666, 0
training_args.disable_tqdm = False
# Write outputs under the directory named by repo_name.
training_args.output_dir=repo_name

In [None]:
# prepare for Training
# Labels in order of first appearance in the training split.
label_list = list(train_set['dialect'].unique())

print(label_list)
print(train_set['dialect'].value_counts())

data_set = Dataset( "AIMTech", train_set, val_set, label_list )

# Dialect name -> integer class index expected by the model.
label_map = { v:index for index, v in enumerate(label_list) }
print(label_map)

train_dataset = BERTModelDataset(train_set['text'].to_list(),
                                 train_set['dialect'].to_list(),Model_Used,Max_Len,label_map)

# The labels come from val_set: this line referenced an undefined `eval_set`
# and raised NameError before the evaluation dataset could be built.
evaluation_dataset = BERTModelDataset(val_set['text'].to_list(),
                                      val_set['dialect'].to_list(),Model_Used,Max_Len,label_map)


In [None]:

# Wire model, hyper-parameters, datasets and metric callback into one Trainer.
# model_init() is called here, so a single model instance is handed over.
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset= evaluation_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Echo the key hyper-parameters so they are recorded next to the training log.
print(Max_Len)
print(training_args.learning_rate)
print(training_args.adam_epsilon)
print(training_args.warmup_steps)

# Run fine-tuning; evaluation is scheduled through training_args.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub; relies on the
# notebook_login() call made at the top of the notebook.
trainer.push_to_hub()