In [None]:
import numpy as np
import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))


In [None]:


import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def explore_data(df_train, df_test):
    """Print a quick overview of the train/test frames and plot text lengths.

    NOTE: ``df_train`` is modified in place. Rows whose 'text' is missing are
    dropped and a 'count_words' column is added, so the caller's frame is
    changed after this call. Later cells of the notebook reuse that frame.

    Args:
        df_train: Training DataFrame with 'text' and 'target' columns.
        df_test: Test DataFrame; only its first rows are printed.

    Returns:
        None. Output is printed and a histogram is drawn on the current figure.
    """
    # Display basic information about train and test dataframes
    print("Train Data:")
    print(df_train.head())

    print("\nTest Data:")
    print(df_test.head())

    # Drop rows with missing 'text' values in the train set (in place, see NOTE above)
    df_train.dropna(subset=['text'], inplace=True)

    # Count the occurrences of each target value.
    # .get(label, 0) reports 0 for a class that is absent instead of raising KeyError.
    target_value_counts = df_train['target'].value_counts()
    print("Count of Target 1:", target_value_counts.get(1, 0))
    print("Count of Target 0:", target_value_counts.get(0, 0))

    # Create a new column 'count_words' to store the word count of each text
    # (whitespace-separated tokens)
    df_train['count_words'] = df_train['text'].str.split().str.len()

    # Label the plot so the figure can be read on its own
    ax = sns.histplot(df_train['count_words'], bins=range(1, 35, 2))
    ax.set(title="Words per training text", xlabel="count_words", ylabel="Count")

# Locations of the competition CSV files; adjust them to your environment
train_file_path, test_file_path = (
    '../input/speechify/data/train.csv',
    '../input/speechify/data/test.csv',
)

# Read the train file first, then the test file, and run the exploratory summary
df, df_test = map(pd.read_csv, (train_file_path, test_file_path))
explore_data(df, df_test)



In [None]:
from sklearn.model_selection import train_test_split

def preprocess_and_split_data(df, df_test, test_size=0.2, random_state=42):
    """Select the text/target columns and make a stratified train/validation split.

    Args:
        df: Training DataFrame with 'text' and 'target' columns.
        df_test: Test DataFrame with a 'text' column.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid, X_test)`` where ``X_test``
        is the 'text' column of ``df_test``.
    """
    texts, labels = df['text'], df['target']
    test_texts = df_test['text']

    # Stratify on the labels when splitting
    split_parts = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    return (*split_parts, test_texts)

# Relies on df and df_test loaded by the earlier data-loading cell
split_result = preprocess_and_split_data(df, df_test)
X_train, X_valid, y_train, y_valid, X_test = split_result

# Preview the training texts and the test frame
for preview in (X_train, df_test):
    print(preview)

In [None]:
# Use an LSTM: it mitigates the vanishing-gradient problem of plain RNNs and suits this NLP task
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

def build_model(MAX_WORD_LENGTH, vocab_size=None):
    """Build and compile a bidirectional-LSTM binary classifier.

    Architecture: Embedding(64) -> Bidirectional(LSTM(64)) -> Dense(64, relu)
    -> Dropout(0.5) -> Dense(1, sigmoid), compiled with binary cross-entropy,
    Adam and the 'accuracy' metric.

    Args:
        MAX_WORD_LENGTH: Length of each integer token sequence fed to the model.
        vocab_size: Number of rows in the embedding table. When omitted it is
            derived from the notebook-level ``tokenizer`` (which must then be
            defined before this function is called).

    Returns:
        The compiled ``Sequential`` model.
    """
    if vocab_size is None:
        # Keras Tokenizer ids start at 1 (0 is the padding value), so the
        # largest possible id is len(word_index). Embedding needs
        # max index + 1 rows, hence the + 1.
        vocab_size = len(tokenizer.word_index) + 1

    # Only names imported in this cell (tf / tensorflow.keras) are used, so the
    # function does not rely on the standalone `keras` import of a later cell.
    model = Sequential()
    model.add(tf.keras.Input(shape=(MAX_WORD_LENGTH,)))
    model.add(Embedding(vocab_size, 64))
    model.add(tf.keras.layers.Bidirectional(LSTM(64)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])
    return model

In [None]:
# Passed as `maxlen` to pad_sequences (sequence length fed to the model).
# NOTE(review): this notebook also passes the same value as the Tokenizer's
# `num_words`, i.e. it doubles as the vocabulary cap. Confirm that is intended.
MAX_WORD_LENGTH = 35
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def tokenize_and_pad_data(X_train, X_test, max_word_length, num_words=None):
    """Fit a Tokenizer on the training texts and encode/pad both text sets.

    Args:
        X_train: Iterable of training texts; the tokenizer is fitted on these only.
        X_test: Iterable of test texts, encoded with the same fitted tokenizer.
        max_word_length: Sequence length passed to ``pad_sequences`` as ``maxlen``.
        num_words: Vocabulary cap passed to ``Tokenizer``. Defaults to
            ``max_word_length`` to keep the notebook's historical behaviour.
            NOTE(review): that default limits the vocabulary to the most
            frequent ``max_word_length - 1`` words. Pass an explicit, larger
            value to decouple vocabulary size from sequence length.

    Returns:
        Tuple ``(X_train_padded, X_test_padded, tokenizer)``.
    """
    if num_words is None:
        # Backward-compatible default: reuse the sequence length as vocabulary cap
        num_words = max_word_length

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(X_train)

    def _encode(texts):
        # Texts -> integer id sequences -> fixed-length padded array
        return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_word_length)

    X_train_padded = _encode(X_train)
    X_test_padded = _encode(X_test)

    return X_train_padded, X_test_padded, tokenizer

# Relies on X_train, X_test and y_train produced by the earlier split cell
padded_splits = tokenize_and_pad_data(X_train, X_test, MAX_WORD_LENGTH)
X_train_padded, X_test_padded, tokenizer = padded_splits

# Build the network, then train it; Keras holds out 20% of the training data for validation
model = build_model(MAX_WORD_LENGTH)
fit_options = dict(batch_size=32, epochs=15, verbose=2, validation_split=0.2)
history = model.fit(X_train_padded, y_train, **fit_options)

In [None]:
# Load the submission template and show it before filling in predictions
df_sample = pd.read_csv('../input/speechify/data/sample_submission.csv')
print(df_sample)

# Threshold the model outputs at 0.5 to obtain hard 0/1 labels
test_probabilities = model.predict(X_test_padded)
df_sample['target'] = (test_probabilities >= 0.5).astype(int)
print(df_sample)


In [None]:
from sklearn.metrics import classification_report
# Encode the validation texts with the tokenizer that was fitted on the
# training texts. Fitting a fresh Tokenizer on X_valid would assign different
# integer ids to the words than the ones the model was trained on, and would
# overwrite the notebook-level `tokenizer`.
valid_sequences = tokenizer.texts_to_sequences(X_valid)
X_valid_padded = pad_sequences(valid_sequences, maxlen=MAX_WORD_LENGTH)

# Threshold the model outputs at 0.5 and report per-class metrics
y_valid_estimation = np.where(model.predict(X_valid_padded) >= 0.5, 1, 0)
print(classification_report(y_valid, y_valid_estimation))


In [None]:


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def train_with_stratified_kfold(X, y, build_model_func, MAX_WORD_LENGTH, n_splits=10, epochs=40, test_texts=None):
    """Train one model per stratified fold and collect F1 scores and test predictions.

    For every fold a new Tokenizer is fitted on that fold's training texts, a
    new model is built with ``build_model_func(MAX_WORD_LENGTH)`` and trained,
    its F1 score on the fold's validation part is recorded, and its predictions
    on the test texts are stored.

    Args:
        X: Texts (Series, list or array).
        y: Binary targets aligned with ``X``.
        build_model_func: Callable taking ``MAX_WORD_LENGTH`` and returning a
            compiled model. NOTE(review): the fold-local tokenizer is not
            passed to it, so any vocabulary size it needs comes from elsewhere.
        MAX_WORD_LENGTH: Used both as the Tokenizer ``num_words`` and as the
            ``maxlen`` of ``pad_sequences``.
        n_splits: Number of stratified folds.
        epochs: Training epochs per fold.
        test_texts: Test texts to predict on. When omitted, the notebook-level
            ``X_test`` is used, as before.

    Returns:
        Tuple ``(f1_score_kfolds, y_test_results)``: a list with one F1 score
        per fold and a list with one test-prediction array per fold.
    """
    # StratifiedKFold yields positional indices. Indexing a pandas Series with
    # them is label-based, so it fails or selects wrong rows once the index
    # has gaps (e.g. after dropna). Plain arrays are indexed by position.
    texts = np.asarray(X)
    labels = np.asarray(y)

    if test_texts is None:
        test_texts = X_test  # fall back to the notebook-level test texts

    f1_score_kfolds = []
    y_test_results = []

    skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_ix, valid_ix in skfold.split(texts, labels):
        fold_train_texts, fold_valid_texts = texts[train_ix], texts[valid_ix]
        fold_train_labels, fold_valid_labels = labels[train_ix], labels[valid_ix]

        # Fit the tokenizer on this fold's training part only
        tokenizer = Tokenizer(num_words=MAX_WORD_LENGTH)
        tokenizer.fit_on_texts(fold_train_texts)

        def _encode(raw_texts):
            # Texts -> integer id sequences -> fixed-length padded array
            return pad_sequences(tokenizer.texts_to_sequences(raw_texts), maxlen=MAX_WORD_LENGTH)

        X_train_padded = _encode(fold_train_texts)
        X_valid_padded = _encode(fold_valid_texts)
        # The test set is re-encoded per fold because each fold has its own tokenizer
        X_test_padded = _encode(test_texts)

        model = build_model_func(MAX_WORD_LENGTH)
        model.fit(X_train_padded, fold_train_labels, batch_size=32, epochs=epochs, verbose=2, validation_split=0.2)

        y_valid_estimation = np.where(model.predict(X_valid_padded) >= 0.5, 1, 0)
        f1 = f1_score(fold_valid_labels, y_valid_estimation)
        print("F1 Score is", f1)
        f1_score_kfolds.append(f1)

        y_test_results.append(model.predict(X_test_padded))

    return f1_score_kfolds, y_test_results

# Run the stratified k-fold training on the full training frame
X, y = df['text'], df['target']
kfold_output = train_with_stratified_kfold(X, y, build_model, MAX_WORD_LENGTH)
f1_scores, y_test_results = kfold_output

In [None]:
# Per-fold F1 scores from the k-fold run
print(f1_scores)
df_sample = pd.read_csv('../input/speechify/data/sample_submission.csv')

# Average the per-fold test predictions, then threshold at 0.5 for hard 0/1 labels
mean_test_probabilities = np.mean(y_test_results, axis=0)
df_sample['target'] = (mean_test_probabilities >= 0.5).astype(int)
print(df_sample)
