In [10]:
import pandas as pd
import nltk  # https://realpython.com/nltk-nlp-python/#chunking
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk import RegexpParser
nltk.download('omw-1.4')

# file_path = r'C:\Users\Jade Chen\URAP_India\conflict-india\data\acled_india_30may.csv'
acled_df = pd.read_csv('acled_india_csv.csv', delimiter=';')


# Display the first five rows and data types of each column
print(acled_df.head(5))
print(acled_df.dtypes)



# Display the 'notes' column
print(acled_df['notes'].head())

acled_df = acled_df.head(100)

print("Done")

[nltk_data] Downloading package omw-1.4 to /opt/conda/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


   data_id  iso event_id_cnty  event_id_no_cnty   event_date  year  \
0  7118851  356      IND74745             74745  30 May 2020  2020   
1  7118861  356      IND74746             74746  30 May 2020  2020   
2  7118864  356      IND74748             74748  30 May 2020  2020   
3  7118903  356      IND74739             74739  30 May 2020  2020   
4  7118904  356      IND74740             74740  30 May 2020  2020   

   time_precision event_type         sub_event_type              actor1  ...  \
0               1      Riots  Violent demonstration     Rioters (India)  ...   
1               1   Protests       Peaceful protest  Protesters (India)  ...   
2               1   Protests       Peaceful protest  Protesters (India)  ...   
3               1      Riots           Mob violence     Rioters (India)  ...   
4               1      Riots  Violent demonstration     Rioters (India)  ...   

    location   latitude  longitude geo_precision          source  \
0   Uluberia  22.475599  88.09

In [11]:


# Define a function to prepare text for NLP
def prep_text(text):
    if isinstance(text, str):
        # Tokenize the text into words
        words = word_tokenize(text)

        # Remove punctuation and convert to lowercase
        words = [word.lower() for word in words if word.isalnum()]

        # Remove stopwords using the "english" set of stopwords
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]

        # Lemmatizing (reduce words to their core meaning)
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        # Stemming (reduce words to their root)
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

        # Chunking (group words to identify phrases)
        # NOTE TO SELF: change chunk grammar regexes depending on project next steps
        grammar = r"""
            NP: {<DT>?<JJ>*<NN>} # Chunk noun phrases
            VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verb phrases
        """
        chunk_parser = RegexpParser(grammar)
        tree = chunk_parser.parse(nltk.pos_tag(words))  # parses the POS-tagged words into chunks

        # Chinking (excludes some pattern from chunks)
        chink_parser = RegexpParser("NP: {<.*>+} # Chink all else")
        tree = chink_parser.parse(tree)  # Use tree.draw() to visualize tree if needed

        # Convert the parsed tree back to a string
        cleaned_text = ' '.join([word for subtree in tree for word in subtree if isinstance(word, str)])

        return cleaned_text

    # If text is NaN, return an empty string
    else:
        return ''


# Apply the prep_text function to the 'notes' column and store in new 'cleaned_notes' column
acled_df['cleaned_notes'] = acled_df['notes'].apply(prep_text)

# Display the 'cleaned_notes' and 'notes' column
print(acled_df[['notes', 'cleaned_notes']].head())

                                               notes cleaned_notes
0  On 30 May 2020, two groups of people clashed w...              
1  On 30 May 2020, local residents staged protest...              
2  On 30 May 2020, the local residents staged pro...              
3  On 30 May 2020, members of Trinamool Congress ...              
4  On 30 May 2020, local residents staged a demon...              
