In [None]:
!pip install kaggle




In [None]:
from google.colab import files
# Bind the result to a name: a bare `files.upload()` as the last expression
# makes the notebook echo the uploaded file contents (here the Kaggle API
# key) into the saved cell output.
uploaded = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
import os
import zipfile

# Install the uploaded Kaggle API token where the `kaggle` CLI looks for it.
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_token_path = os.path.join(kaggle_dir, 'kaggle.json')

os.makedirs(kaggle_dir, exist_ok=True)

# Only move the token when a freshly uploaded copy is present, so this cell
# can be re-run after the file has already been moved.
if os.path.exists('kaggle.json'):
    os.replace('kaggle.json', kaggle_token_path)

# Restrict the token to owner read/write only.
os.chmod(kaggle_token_path, 0o600)




### Downloading the dataset from Kaggle using the API key

In [None]:
!kaggle competitions download -c shl-hiring-assessment


shl-hiring-assessment.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Unpack the downloaded competition archive into its own folder.
with zipfile.ZipFile('shl-hiring-assessment.zip', mode='r') as archive:
    archive.extractall(path='shl-hiring-assessment')


### Importing the Dataset from folder

In [None]:
import pandas as pd

# Read the three competition tables from the extracted dataset folder.
DATASET_DIR = "shl-hiring-assessment/Dataset"
train_df, test_df, submission_df = (
    pd.read_csv(f"{DATASET_DIR}/{table}.csv")
    for table in ("train", "test", "sample_submission")
)

In [None]:
!pip install -q openai-whisper
import whisper
import os


In [None]:
from tqdm import tqdm


In [None]:
# Folders holding the audio clips referenced by the train / test tables.
AUDIO_PATH_TRAIN = "shl-hiring-assessment/Dataset/audios/train"
AUDIO_PATH_TEST = "shl-hiring-assessment/Dataset/audios/test"

# Whisper "base" checkpoint used for speech-to-text.
model = whisper.load_model(name="base")


In [None]:
# Accumulators for the transcripts, one entry per audio file.
transcripts_train, transcripts_test = [], []

### Generating transcripts of the audio files using whisper base model

In [None]:
def get_transcripts(dataset,transcripts,AUDIO_PATH):
    """Transcribe every audio file listed in ``dataset['filename']``.

    Each file name is joined onto ``AUDIO_PATH`` and passed to the
    module-level ``model``'s ``transcribe``; the stripped ``'text'`` of the
    result is appended to ``transcripts`` in row order.

    A file that fails to transcribe is recorded as the placeholder
    ``"[ERROR]"`` so the list stays aligned with ``dataset``. The failure is
    reported on the progress-bar stream rather than being silently swallowed.

    Args:
        dataset: table with a ``'filename'`` column.
        transcripts: list that is extended in place, one entry per row.
        AUDIO_PATH: directory the file names are relative to.

    Returns:
        The same ``transcripts`` list, for convenience.
    """
    for fname in tqdm(dataset['filename']):
        file_path = os.path.join(AUDIO_PATH, fname)
        try:
            result = model.transcribe(file_path)
            transcripts.append(result['text'].strip())
        except Exception as e:
            # Keep going (best effort), but make the failure visible.
            tqdm.write(f"[get_transcripts] failed on {file_path}: {e!r}")
            transcripts.append("[ERROR]")
    return transcripts

In [None]:
# Start from empty lists so that re-running this cell does not append a
# second copy of every transcript (which would break the length match when
# the lists are later assigned as DataFrame columns).
transcripts_train.clear()
transcripts_test.clear()

get_transcripts(train_df,transcripts_train,AUDIO_PATH_TRAIN)
get_transcripts(test_df,transcripts_test,AUDIO_PATH_TEST)


100%|██████████| 444/444 [37:29<00:00,  5.07s/it]
100%|██████████| 204/204 [11:40<00:00,  3.44s/it]


In [None]:
# Attach each transcript list to its table as a 'transcript' column.
for frame, texts in ((train_df, transcripts_train), (test_df, transcripts_test)):
    frame['transcript'] = texts


### Saving the tables with their transcripts so they can be re-imported later without re-running Whisper

In [None]:
# Persist both transcript-augmented tables as CSV, without the index column.
for frame, out_name in (
    (train_df, 'merged_train_with_transcripts.csv'),
    (test_df, 'merged_test_with_transcripts.csv'),
):
    frame.to_csv(out_name, index=False)


### Removing rows with null transcripts (only 1–2 such rows in total, so they are dropped rather than imputed)

In [None]:
# Drop training rows that have no transcript. `.copy()` yields an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
train_df = train_df.dropna(subset=['transcript']).copy()

# Test rows must all be kept: one prediction is needed per row of the
# submission file, so a dropped row would make the label assignment fail on a
# length mismatch. Missing transcripts become empty strings instead.
test_df = test_df.assign(transcript=test_df['transcript'].fillna(''))

## Using a BERT-style sentence-transformer model (`all-mpnet-base-v2`) to encode the transcripts, as it captures sequential meaning bidirectionally

In [None]:
from sentence_transformers import SentenceTransformer
# Use a dedicated name for the sentence encoder: `get_transcripts` reads the
# global `model` (the Whisper model), so rebinding `model` here would break
# any later re-run of the transcription cells.
embedder = SentenceTransformer('all-mpnet-base-v2')

# One embedding vector per transcript, in row order.
X_train_text = embedder.encode(train_df['transcript'].tolist(), show_progress_bar=True)
X_test_text = embedder.encode(test_df['transcript'].tolist(), show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 113912
[LightGBM] [Info] Number of data points in the train set: 443, number of used features: 771
[LightGBM] [Info] Start training from score 3.969526




### 💡 POS Feature Extraction and Integration for Score Regression
### This section enhances text-based features by extracting linguistic patterns from transcripts.
### Specifically, we:
### - Use spaCy to count key Part-of-Speech (POS) tags (NOUN, VERB, ADJ, ADV, PROPN), which capture grammatical structure.
### - Represent each transcript as a fixed-length vector of POS tag counts.
### - Scale these features for consistency and better performance in machine learning models.
### - Combine the scaled POS features with precomputed text-based features (e.g., embeddings or TF-IDF)
###   to form a richer, more informative feature set for model training and prediction.



In [None]:
import spacy
from tqdm import tqdm
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import StandardScaler
from spacy.symbols import NOUN, VERB, ADJ, ADV, PROPN

# Part-of-speech tags to count; their order fixes the feature-vector layout.
POS_TAGS = [NOUN, VERB, ADJ, ADV, PROPN]

# spaCy pipeline used to tag the transcripts.
nlp = spacy.load("en_core_web_sm")

def extract_pos_features(text):
    """Return the count of each tag in ``POS_TAGS`` for ``text``.

    The text is run through the module-level ``nlp`` pipeline and its tokens
    are tallied by POS attribute. The result is a list with one count per
    entry of ``POS_TAGS`` (0 when a tag does not occur), in the same order.
    """
    tag_histogram = nlp(text).count_by(spacy.attrs.POS)
    features = []
    for tag in POS_TAGS:
        features.append(tag_histogram.get(tag, 0))
    return features

tqdm.pandas()  # registers `progress_apply` on pandas objects

# `assign` returns a new frame, so this works without SettingWithCopyWarning
# even when `train_df` / `test_df` are filtered slices of another frame.
train_df = train_df.assign(
    pos_features=train_df['transcript'].progress_apply(extract_pos_features)
)
test_df = test_df.assign(
    pos_features=test_df['transcript'].progress_apply(extract_pos_features)
)

# Every vector already has exactly len(POS_TAGS) entries, so no padding step
# is needed before stacking them into a 2-D array.
train_pos_counts = np.vstack(train_df['pos_features'].tolist())
test_pos_counts = np.vstack(test_df['pos_features'].tolist())

# Fit the scaler on the training counts only, then reuse it for the test set.
scaler = StandardScaler()
train_pos_scaled = scaler.fit_transform(train_pos_counts)
test_pos_scaled = scaler.transform(test_pos_counts)

# Final feature matrices: text embeddings followed by the scaled POS counts.
X_train_combined = np.hstack([X_train_text, train_pos_scaled])
X_test_combined = np.hstack([X_test_text, test_pos_scaled])
y_train = train_df['label'].values

100%|██████████| 443/443 [00:15<00:00, 28.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['pos_features'] = train_df['transcript'].progress_apply(extract_pos_features)
100%|██████████| 204/204 [00:06<00:00, 33.46it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['pos_features'] = train_df['pos_features'].apply(lambda x: x + [0]*(max_len - len(x)))


### These hyperparameters gave the best results in my experiments

In [None]:
# Hyperparameters for the gradient-boosted tree regressor.
xgb_params = dict(
    objective='reg:squarederror',
    booster='gbtree',
    n_estimators=400,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
)

# Fit on the combined training features, then score the test features.
reg = xgb.XGBRegressor(**xgb_params)
reg.fit(X_train_combined, y_train)
y_test_pred = reg.predict(X_test_combined)

# Allowed score grid: 1.0, 1.5, ..., 5.0 (nine values in steps of 0.5).
valid_scores = np.linspace(1.0, 5.0, 9)

def round_to_closest_score(predictions, valid_scores):
    """Snap each prediction to the nearest entry of ``valid_scores``.

    Args:
        predictions: 1-D array-like of raw predictions.
        valid_scores: 1-D array-like of allowed output values.

    Returns:
        ``numpy.ndarray`` with one element of ``valid_scores`` per prediction.
        When a prediction is equally close to two allowed values, the one
        that appears first in ``valid_scores`` is chosen (``argmin`` returns
        the first minimum).
    """
    # Accept plain lists as well as arrays; `[:, None]` needs an ndarray.
    valid_scores = np.asarray(valid_scores)
    predictions = np.asarray(predictions)

    # Distance table of shape (n_scores, n_predictions); pick the closest
    # allowed score for each prediction column.
    idx = np.argmin(np.abs(valid_scores[:, None] - predictions), axis=0)
    return valid_scores[idx]

# Snap the raw regressor outputs onto the allowed score grid.
y_test_pred_rounded = round_to_closest_score(
    predictions=y_test_pred,
    valid_scores=valid_scores,
)

### `uio.csv` is the same file as `sample_submission.csv`, just saved under a different name

In [None]:
# Start from the sample submission that was already loaded with the dataset,
# instead of reading 'uio.csv', which does not exist on a fresh run.
sample = submission_df.copy()

In [None]:
# Store the rounded predictions in the submission's 'label' column.
sample = sample.assign(label=y_test_pred_rounded)

In [None]:
# Write the completed submission to disk without the index column.
SUBMISSION_FILE = 'uio.csv'
sample.to_csv(SUBMISSION_FILE, index=False)