# Importing all necessary libraries

In [None]:
import nltk
import re

from nltk.corpus import stopwords
from torch.nn.functional import softmax
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Function for preprocessing the data 

In [None]:
# Define preprocessing function
# Lazily populated cache of the NLTK resources used by preprocessing_text, so
# they are built once instead of once per row.
_PREPROCESSING_CACHE = {}


def _load_preprocessing_resources():
    """Return the English stop-word set and a lemmatizer, creating them once."""
    if not _PREPROCESSING_CACHE:
        # Build both resources before publishing them so a failure (e.g. a
        # missing NLTK corpus) never leaves a half-filled cache behind.
        resources = {
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESSING_CACHE.update(resources)
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']


def preprocessing_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, the result is
    tokenized, English stop words are dropped, each remaining token is
    lemmatized, and the tokens are joined back with single spaces.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the
            ``NaN`` of a missing cell) are treated as empty text.

    Returns:
        The preprocessed text as one string; ``''`` when ``text`` is not a
        string.
    """
    # A missing cell has no `.lower()`; return empty text instead of
    # aborting a whole column-wide `.apply`.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _load_preprocessing_resources()

    # Lower-case first: the stop-word comparison below is case-sensitive.
    lowered = text.lower()

    # Replace punctuation/symbols (anything that is not a word character or
    # whitespace) with a space; letters, digits and underscores are kept.
    cleaned = re.sub(r'[^\w\s]', ' ', lowered)

    # Tokenize, then drop stop words such as 'a', 'the', 'is'.
    tokens = [word for word in word_tokenize(cleaned) if word not in stop_words]

    # Reduce each token to its lemma.
    # NOTE(review): lemmatize() is called without a part-of-speech tag, so
    # adjective/verb forms may be left unchanged - confirm against NLTK docs.
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)



In [None]:
# Run the text preprocessing over the docstring column first and the code
# column second, storing each result in its own "tokenized_*" column.
for source_column, target_column in (
    ('docstring', 'tokenized_docstring'),
    ('code', 'tokenized_code'),
):
    data[target_column] = data[source_column].apply(preprocessing_text)

In [None]:
# Preview the first rows to check the two new tokenized columns.
data.head()

# Function for checking the language of the text

In [None]:

def detect_language(text):
    """Return the language code detected for ``text``, or ``None`` on failure.

    NOTE(review): ``detect`` is not defined or imported in the visible part
    of this file; presumably it is ``langdetect.detect`` - confirm.

    Args:
        text: The text whose language should be identified.

    Returns:
        Whatever ``detect(text)`` returns, or ``None`` if it raised.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort: a row whose language cannot be determined is reported
        # as None. Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate, so a long `.apply` run can still be stopped.
        return None
    


# Removing all rows whose detected language is not English

In [None]:
# Apply language detection to each text entry in the DataFrame
data['language'] = data['tokenized_docstring'].apply(detect_language)

# Filter out non-English entries
data = data[data['language'] == 'en'].reset_index(drop=True)

# Drop the language column as it’s no longer needed
data.drop(columns=['language'], inplace=True)

# Removing all rows whose tokenized docstring has fewer than 4 tokens

In [None]:
# Keep only rows whose preprocessed docstring splits into at least four
# whitespace-separated tokens.
data = data[
    data['tokenized_docstring'].apply(lambda docstring: len(docstring.split()) >= 4)
]

In [None]:
# Show the (rows, columns) size of the data left after the filters above.
data.shape

# Saving the data

In [None]:
# Write the filtered data to processed_data.csv; index=False leaves the row
# index out of the file.
data.to_csv('processed_data.csv', index=False)