# Importing necessary libraries

In [1]:
import pandas as pd
import os
import sys

# Importing the data_preprocessor class from data_cleaner.py and the loading_data class from data_loader.py, both located in the src/Data folder

In [2]:
# Build the path to <parent of the working directory>/src/Data.
# The path is derived from os.getcwd(), so the imports below only resolve when
# the notebook is run from a folder that sits next to "src".
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
src_data_directory = os.path.join(parent_directory, "src", "Data")
print(src_data_directory)

# Let Python search this directory for modules. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate entries to sys.path.
if src_data_directory not in sys.path:
    sys.path.append(src_data_directory)

# Project-local helpers that live in src/Data.
# data_preprocessor: class from the data_cleaner module.
from data_cleaner import data_preprocessor

# loading_data: class from the data_loader module.
from data_loader import loading_data


/Users/msaqib/Second_project/src/Data


# Creating an object of the data_preprocessor class

In [3]:
# Instantiate the preprocessing helper imported from the data_cleaner module
data_object = data_preprocessor()

# Bare last expression: the notebook echoes the object's repr, confirming it was created
data_object


<data_cleaner.data_preprocessor at 0x142439b90>

# Creating an object of the loading_data class

In [4]:
# Instantiate the path/loading helper imported from the data_loader module
load_object = loading_data()

# Echo the object's repr to confirm that it was created
load_object

<data_loader.loading_data at 0x14501a550>

# Getting the path of the interim_data.csv file using the get_file_path method of the loading_data class

In [5]:
# Ask the loader helper for the location of interim_data.csv inside "Data"
interim_file_path = load_object.get_file_path("interim_data.csv", "Data")

# Show the resolved path so a wrong location is spotted before reading
print(interim_file_path)

/Users/msaqib/Second_project/Data/interim_data.csv


# Reading the interim data

In [6]:
# Load the interim CSV into a DataFrame
data = pd.read_csv(interim_file_path)

In [7]:
# (rows, columns) of the interim data as loaded from the CSV
data.shape

(150000, 2)

# Applying preprocessing to the docstring and code columns

In [7]:
# Run preprocessing_text (from data_cleaner.py) over both text columns and
# attach the results as two new columns, in this order:
#   docstring -> tokenized_docstring
#   code      -> tokenized_code
data = data.assign(
    tokenized_docstring=data['docstring'].apply(data_object.preprocessing_text),
    tokenized_code=data['code'].apply(data_object.preprocessing_text),
)

In [8]:
# Preview the first rows to check the two new tokenized columns
data.head()

Unnamed: 0,docstring,code,tokenized_docstring,tokenized_code
0,Multiprocessing target for the zmq queue device,def zmq_device(self):\n '''\n Mu...,multiprocessing target zmq queue device,def zmq_device self multiprocessing target zmq...
1,Cleanly shutdown the router socket,def close(self):\n '''\n Cleanly...,cleanly shutdown router socket,def close self cleanly shutdown router socket ...
2,Pre-fork we need to create the zmq router devi...,"def pre_fork(self, process_manager):\n ...",pre fork need create zmq router device param f...,def pre_fork self process_manager pre fork nee...
3,Starts ZMQ monitor for debugging purposes.\n ...,def _start_zmq_monitor(self):\n '''\n ...,start zmq monitor debugging purpose return,def _start_zmq_monitor self start zmq monitor ...
4,After forking we need to create all of the loc...,"def post_fork(self, payload_handler, io_loop):...",forking need create local socket listen router...,def post_fork self payload_handler io_loop for...


# Removing all rows whose docstring is not in English, using the detect_language function from data_cleaner.py

In [9]:
# Keep only the rows whose tokenized docstring is detected as English ('en').
# detect_language (from data_cleaner.py) is applied to every tokenized docstring
# and the comparison yields a boolean mask, so no temporary 'language' column
# has to be added to the frame and dropped again afterwards.
# NOTE(review): if detect_language wraps a probabilistic detector, seed it so
# this filter is reproducible between runs - confirm in data_cleaner.py.
data = (
    data
    .loc[data['tokenized_docstring'].apply(data_object.detect_language) == 'en']
    # Renumber the surviving rows 0..n-1
    .reset_index(drop=True)
)

# Removing all rows whose tokenized docstring has fewer than 4 tokens

In [10]:
# Minimum number of whitespace-separated tokens a tokenized docstring needs to be kept
MIN_DOCSTRING_TOKENS = 4

# Count tokens with the vectorized .str accessor instead of a per-row Python lambda.
# Missing values give a NaN count, which compares False, so such rows are dropped
# rather than raising AttributeError on `.split()`.
docstring_token_counts = data['tokenized_docstring'].str.split().str.len()

# Keep the sufficiently long docstrings and renumber the rows 0..n-1 so the
# index stays contiguous after the filter.
data = data[docstring_token_counts >= MIN_DOCSTRING_TOKENS].reset_index(drop=True)

In [11]:
# (rows, columns) remaining after the language and length filters
data.shape

(113799, 4)

In [12]:
# Preview the first rows of the filtered data
data.head()

Unnamed: 0,docstring,code,tokenized_docstring,tokenized_code
0,Cleanly shutdown the router socket,def close(self):\n '''\n Cleanly...,cleanly shutdown router socket,def close self cleanly shutdown router socket ...
1,Pre-fork we need to create the zmq router devi...,"def pre_fork(self, process_manager):\n ...",pre fork need create zmq router device param f...,def pre_fork self process_manager pre fork nee...
2,After forking we need to create all of the loc...,"def post_fork(self, payload_handler, io_loop):...",forking need create local socket listen router...,def post_fork self payload_handler io_loop for...
3,Handle incoming messages from underlying TCP s...,"def handle_message(self, stream, payload):\n ...",handle incoming message underlying tcp stream ...,def handle_message self stream payload handle ...
4,Bind to the interface specified in the configu...,"def _publish_daemon(self, log_queue=None):\n ...",bind interface specified configuration file,def _publish_daemon self log_queue none bind i...


# Saving the processed data in the Data folder using the get_file_path method of the loading_data class

In [8]:
# Resolve the destination of processed_data.csv inside "Data" via the loader helper
processed_data_path = load_object.get_file_path("processed_data.csv", "Data")

# Write the processed frame to disk; index=False keeps the row index out of the file
data.to_csv(processed_data_path, index=False)