In [None]:
# Using regex to match some text patterns

import pandas as pd
import re

# Load the dataset into `df`. Later cells read its `comment` column and add
# label columns to this same frame.
input_file = "Kenny_claudeclassification.csv"  # Replace with your actual file path
df = pd.read_csv(input_file)

# Define a function to identify company-related comments
def identify_company_comment(comment):
    """Label a comment 1 if it contains company/team phrasing, otherwise 0.

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is labelled 0
            instead of being handed to ``re.search``, which would raise
            ``TypeError``.

    Returns:
        int: 1 when any pattern matches case-insensitively, otherwise 0.
    """
    # A missing comment has no text to match; treat it as individual-related.
    if not isinstance(comment, str):
        return 0

    # Define company-related keywords and patterns
    company_patterns = [
        r'working for [A-Za-z\s]+',  # Matches phrases like "working for [company]"
        r'as part of [A-Za-z\s]+',   # Matches "as part of [team]"
        r'updated for [A-Za-z\s]+',  # Matches "updated for [project]"
        r'\b(company|team|project|office)\b',  # General keywords
        # Add known company names or more specific terms here
    ]

    # 1 = company-related, 0 = individual-related
    return int(any(re.search(pattern, comment, re.IGNORECASE) for pattern in company_patterns))

# Label every comment with the regex heuristic defined above
df['company_related'] = df['comment'].apply(identify_company_comment)

# Preview the labelled frame. A bare last expression uses the notebook's rich
# table display instead of the wrapped plain-text output of print().
df.head()

         id                                          comment  new  technical  \
0  44894774                New road construction in progress    1          0   
1  44914065                                      Added house    1          0   
2  44967243                    (node) - added [tag=website]}    1          0   
3  45147457  Aligning or naming imported tiger roads #to-fix    0          1   
4  45147673                                highways modified    0          0   

   local  correctional  company_related  
0    1.0           0.0                0  
1    0.0           0.0                0  
2    0.0           0.0                0  
3    0.0           1.0                0  
4    0.0           1.0                0  


Since this approach may not be 100% accurate, I manually reviewed a subset of the results to refine the patterns, which can then be used to train a more sophisticated model.

Approach:
1. Identify Hashtags: Search for hashtags in the comments.
2. Keyword Matching: Look for known company names or related terms.
3. Label Company-Related Comments.

I'll create a function to extract and label comments with company-related hashtags or mentions.






In [None]:

def identify_company_related_comment(comment):
    """Flag a comment that mentions one of a few known company names.

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is labelled 0
            rather than being passed to ``re.search``, which would raise
            ``TypeError``.

    Returns:
        int: 1 if any company name occurs in the comment (case-insensitive
        substring match), otherwise 0.
    """
    if not isinstance(comment, str):
        return 0

    # re.search finds a name anywhere in the text, so the bare name also
    # covers its hashtag form ("Google" matches inside "#Google"); separate
    # '#...' entries would be redundant.
    company_keywords = [
        r'BlackRock', r'Google', r'Microsoft', r'Amazon', r'Tesla',
        # Add more known company-related terms or hashtags here
    ]

    # 1 = company-related, 0 = individual-related
    return int(any(re.search(keyword, comment, re.IGNORECASE) for keyword in company_keywords))

# Relabel each comment with the hashtag/keyword matcher; this overwrites the
# 'company_related' column written by the earlier regex pass.
df['company_related'] = df['comment'].map(identify_company_related_comment)

# Preview only the columns relevant to this labelling step
df.loc[:, ['id', 'comment', 'company_related']].head()

Unnamed: 0,id,comment,company_related
0,44894774,New road construction in progress,0
1,44914065,Added house,0
2,44967243,(node) - added [tag=website]},0
3,45147457,Aligning or naming imported tiger roads #to-fix,0
4,45147673,highways modified,0


The initial scan has labeled the comments with 0 (individual-related) or 1 (company-related) based on the presence of company-related hashtags or terms. It seems that the sample shown doesn't contain matches, so I'll check if there are any comments labeled as company_related in the dataset.

Let's display some examples of comments that have been labeled as company_related for further inspection.




In [None]:
# Pull out the rows flagged as company-related so they can be inspected by eye
company_related_comments = df.loc[df['company_related'].eq(1)]
company_related_comments.loc[:, ['id', 'comment']].head()

Unnamed: 0,id,comment
180,58224415,Added buildings in #Douentza #Mali #hotosm-pro...
366,68881170,Added buildings in #Moissala #Chad #hotosm-pro...
1620,107150035,#19096 Inclusion of Street Names in the City o...
1702,109378486,Building/Roads Mapping in #MinamiSanriku #hoto...
1939,121517229,#hotosm-project-10970 Added buildings #mapbek...


There are comments that have been labeled as company_related due to the presence of hashtags or company-related terms:

- Comment 180: "Added buildings in #Douentza #Mali #hotosm-project"
- Comment 366: "Added buildings in #Moissala #Chad #hotosm-project"
- Comment 1620: "#19096 Inclusion of Street Names in the City of..."
- Comment 1702: "Building/Roads Mapping in #MinamiSanriku #hotosm-project"
- Comment 1939: "#hotosm-project-10970 Added buildings #mapbek"

These comments include hashtags or keywords that suggest they may be related to organized projects or professional use.



Let's try expanding the keyword list to capture a wider range of companies.

In [None]:
# Candidate regex patterns for company/organization mentions, grouped by kind.
# NOTE(review): no function visible in this notebook reads this module-level
# list; identify_company_related_comment_expanded builds its own local list,
# which differs from this one (it has '#maproulette' and '#Map4Bj' where this
# list has '#HERE' and 'HERE'). Confirm whether this cell is still needed.
# NOTE(review): if these are matched with re.IGNORECASE, as the matcher
# functions in this notebook do, r'HERE' would also match the ordinary word
# "here" (and "there", "where").
company_keywords = [
    # Large corporations
    r'#BlackRock', r'#Google', r'#Microsoft', r'#Amazon', r'#Tesla',
    r'#Apple', r'#Facebook', r'#Meta', r'#IBM', r'#Oracle',
    r'#Uber', r'#Lyft', r'#Airbnb', r'#Zoom', r'#Salesforce',
    r'BlackRock', r'Google', r'Microsoft', r'Amazon', r'Tesla',
    r'Apple', r'Facebook', r'Meta', r'IBM', r'Oracle',
    r'Uber', r'Lyft', r'Airbnb', r'Zoom', r'Salesforce',

    # Mapping and geospatial companies or organizations
    r'#Mapbox', r'#Esri', r'#HERE', r'#TomTom', r'#Garmin', r'#Mapillary',
    r'Mapbox', r'Esri', r'HERE', r'TomTom', r'Garmin', r'Mapillary',

    # Humanitarian and project-specific tags
    r'#HOTOSM', r'#hotosm', r'#MissingMaps', r'#RedCross', r'#UNICEF',
    r'Humanitarian OpenStreetMap Team', r'Missing Maps', r'Red Cross', r'UNICEF',

    # Add more as necessary or based on further findings
]

In [None]:
# Update the function with an expanded keyword list
def identify_company_related_comment_expanded(comment):
    """Flag a comment that mentions a known company, mapping org, or project tag.

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is labelled 0
            rather than being passed to ``re.search``, which would raise
            ``TypeError``.

    Returns:
        int: 1 if any keyword occurs in the comment (case-insensitive
        substring match), otherwise 0.
    """
    if not isinstance(comment, str):
        return 0

    # Matching is a case-insensitive substring search, so a bare name already
    # covers its hashtag form ("Google" matches inside "#Google") and
    # '#HOTOSM' / '#hotosm' are the same pattern. Only entries that are not
    # implied by another entry are listed.
    # NOTE(review): substring matching also means short names hit longer
    # words (e.g. 'Meta' inside "metadata", 'Zoom' for "zoom level").
    company_keywords = [
        # Large corporations
        r'BlackRock', r'Google', r'Microsoft', r'Amazon', r'Tesla',
        r'Apple', r'Facebook', r'Meta', r'IBM', r'Oracle',
        r'Uber', r'Lyft', r'Airbnb', r'Zoom', r'Salesforce',

        # Mapping and geospatial companies or organizations
        r'Mapbox', r'Esri', r'TomTom', r'Garmin', r'Mapillary',
        r'#maproulette', r'#Map4Bj',

        # Humanitarian and project-specific tags
        r'#hotosm', r'#MissingMaps', r'#redcross',
        r'Humanitarian OpenStreetMap Team', r'Missing Maps', r'Red Cross', r'UNICEF',
    ]

    # 1 = company-related, 0 = individual-related
    return int(any(re.search(keyword, comment, re.IGNORECASE) for keyword in company_keywords))

# Relabel every comment with the expanded keyword matcher (replaces the
# previous 'company_related' values)
df['company_related'] = df['comment'].map(identify_company_related_comment_expanded)

# Keep the flagged rows for later cells and preview the first ten
updated_company_related_comments = df.loc[df['company_related'].eq(1)]
updated_company_related_comments.loc[:, ['id', 'comment']].head(10)


Unnamed: 0,id,comment
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...
9,45332136,Adding junction nodes or bridges to overlappin...
11,45342825,#Map4Bj Buildings added
21,46196259,#maproulette Crossing_Ways:_Highway-Railway
25,46522668,#maproulette Crossing_Ways:_Highway-Railway
26,46607617,Adding junction nodes or bridges to overlappin...
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...
43,47835464,#maproulette Crossing_Type:_Highway-Railway
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...


The expanded keyword list successfully identified more company-related comments. Here are some examples:

- Comment 5: "#hotosm-project-2403 #MissingMaps #SouthAfrica..."
- Comment 9: "Adding junction nodes or bridges to overlapping highways #to-fix"
- Comment 41: "#hotosm-project-2791 #MissingMaps #EliminateMalaria"
- Comment 47: "#hotosm-project-2668 #missingmaps #peru #redcross"
- Comment 58: "added buildings #hotosm-project-2469 #MissingMaps"

These examples include relevant hashtags such as #MissingMaps, #HOTOSM, and specific project references that suggest organized or company-related contributions.

# Using NLP to enhance comment tagging

We can use techniques such as Named Entity Recognition (NER) to detect organization names in the text. Libraries like spaCy or transformers can help with this.

I'll use spaCy's pre-trained models to identify entities labeled as organizations in the comments. If spaCy detects an organization in the comment, we can label it as company_related.


In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m85.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load the en_core_web_sm English pipeline once. The NER helper functions in
# this notebook call this shared `nlp` object, so this cell must run first.
nlp = spacy.load("en_core_web_sm")

# Function to identify company-related comments using NER
def identify_company_related_comment_nlp(comment):
    """Label a comment 1 if spaCy NER finds an ORG entity in it, otherwise 0.

    Uses the notebook-level ``nlp`` pipeline.

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) is labelled 0
            without being sent through the pipeline.

    Returns:
        int: 1 when at least one entity has label "ORG", otherwise 0.
    """
    # A missing comment has no text to analyse; treat it as individual-related.
    if not isinstance(comment, str):
        return 0

    doc = nlp(comment)
    # 1 = company-related, 0 = individual-related
    return int(any(ent.label_ == "ORG" for ent in doc.ents))

# Store the NER-based label in its own column so it can be compared with the
# keyword-based 'company_related' column
df['company_related_nlp'] = df['comment'].map(identify_company_related_comment_nlp)

# Keep the rows where NER found an organization and preview the first ten
nlp_company_related_comments = df.loc[df['company_related_nlp'].eq(1)]
nlp_company_related_comments.loc[:, ['id', 'comment']].head(10)

Unnamed: 0,id,comment
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...
11,45342825,#Map4Bj Buildings added
18,45869511,Ð¿Ð¾Ð´Ñ‡Ð¸ÑÑ‚ÐºÐ° Ð·Ð° Ð¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ñ‚ÐµÐ»...
33,47108063,Add street name from OS OpenData Locator
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...
45,47903213,Lakes and lakenames added
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...
58,49537012,added buildings #hotosm-project-2469 #MissingM...
62,49932242,added buildings #hotosm-project-2469 #MissingM...


I'll combine the results from both approaches into one DataFrame that shows the comment index, id, content, and organization for easier access.

In [None]:
# Preview the rows flagged by the expanded keyword matcher.
# NOTE(review): the recorded output lists 'company_related_nlp' and
# 'company_related_combined' columns. No visible cell creates
# 'company_related_combined', and this frame is only re-derived from df
# (after the NLP column exists) in a later cell -- presumably the cells were
# run out of order. Confirm with Restart Kernel -> Run All.
updated_company_related_comments.head(10)

Unnamed: 0,id,comment,new,technical,local,correctional,company_related,company_related_nlp,company_related_combined
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...,1,0,0.0,0.0,1,1,1
9,45332136,Adding junction nodes or bridges to overlappin...,0,1,0.0,1.0,1,0,1
11,45342825,#Map4Bj Buildings added,1,0,0.0,0.0,1,1,1
21,46196259,#maproulette Crossing_Ways:_Highway-Railway,0,1,0.0,1.0,1,0,0
25,46522668,#maproulette Crossing_Ways:_Highway-Railway,0,1,0.0,1.0,1,0,0
26,46607617,Adding junction nodes or bridges to overlappin...,0,1,0.0,1.0,1,0,1
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...,0,1,0.0,1.0,1,1,1
43,47835464,#maproulette Crossing_Type:_Highway-Railway,0,1,0.0,1.0,1,0,0
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...,1,0,0.0,0.0,1,1,1
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...,1,0,0.0,0.0,1,1,1


In [None]:
# Preview the rows where spaCy NER found an ORG entity.
# NOTE(review): the recorded output includes a 'company_related_combined'
# column that no visible cell creates -- presumably left over from a deleted
# or out-of-order cell. Confirm with Restart Kernel -> Run All.
nlp_company_related_comments.head(10)

Unnamed: 0,id,comment,new,technical,local,correctional,company_related,company_related_nlp,company_related_combined
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...,1,0,0.0,0.0,1,1,1
11,45342825,#Map4Bj Buildings added,1,0,0.0,0.0,1,1,1
18,45869511,Ð¿Ð¾Ð´Ñ‡Ð¸ÑÑ‚ÐºÐ° Ð·Ð° Ð¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ñ‚ÐµÐ»...,0,0,1.0,1.0,0,1,1
33,47108063,Add street name from OS OpenData Locator,1,0,1.0,0.0,0,1,1
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...,0,1,0.0,1.0,1,1,1
45,47903213,Lakes and lakenames added,1,0,1.0,0.0,0,1,1
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...,1,0,0.0,0.0,1,1,1
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...,1,0,0.0,0.0,1,1,1
58,49537012,added buildings #hotosm-project-2469 #MissingM...,1,0,0.0,0.0,1,1,1
62,49932242,added buildings #hotosm-project-2469 #MissingM...,1,0,0.0,0.0,1,1,1


In [None]:
# Re-derive both subsets from the current df so they carry every label column
updated_company_related_comments = df[df['company_related'] == 1]
nlp_company_related_comments = df[df['company_related_nlp'] == 1]

# Stack the two subsets (keyword matches first, then NER matches) and keep one
# copy of each original row. De-duplicating on the index, which df inherits
# from read_csv, removes a row only when it is literally the same df row found
# by both methods; drop_duplicates() would also merge distinct rows that just
# happen to hold identical values. .copy() gives an independent frame so later
# cells can add columns to it safely.
# NOTE(review): recorded outputs show a 'company_related_combined' column that
# no visible cell creates; confirm whether a cell defining it was deleted.
combined_comments = (
    pd.concat([updated_company_related_comments, nlp_company_related_comments])
    .loc[lambda stacked: ~stacked.index.duplicated(keep='first')]
    .copy()
)

# Display the combined DataFrame
combined_comments.head(20)

Unnamed: 0,id,comment,new,technical,local,correctional,company_related,company_related_nlp,company_related_combined
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...,1,0,0.0,0.0,1,1,1
9,45332136,Adding junction nodes or bridges to overlappin...,0,1,0.0,1.0,1,0,1
11,45342825,#Map4Bj Buildings added,1,0,0.0,0.0,1,1,1
21,46196259,#maproulette Crossing_Ways:_Highway-Railway,0,1,0.0,1.0,1,0,0
25,46522668,#maproulette Crossing_Ways:_Highway-Railway,0,1,0.0,1.0,1,0,0
26,46607617,Adding junction nodes or bridges to overlappin...,0,1,0.0,1.0,1,0,1
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...,0,1,0.0,1.0,1,1,1
43,47835464,#maproulette Crossing_Type:_Highway-Railway,0,1,0.0,1.0,1,0,0
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...,1,0,0.0,0.0,1,1,1
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...,1,0,0.0,0.0,1,1,1


In [None]:
# Let's extract companies out using NER
# Function to extract company names using spaCy's NER
def extract_company_names_nlp(comment):
    """Return the ORG entities spaCy finds in a comment as one string.

    Uses the notebook-level ``nlp`` pipeline.

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) yields ``None``
            without being sent through the pipeline.

    Returns:
        str | None: The ORG entity texts joined with ", " in document order,
        or ``None`` when there are none.
    """
    if not isinstance(comment, str):
        return None

    doc = nlp(comment)
    companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    return ', '.join(companies) if companies else None

# Add the NER-extracted organization names as a new column on the combined frame
combined_comments['extracted_companies'] = combined_comments['comment'].map(extract_company_names_nlp)

# Preview the comments next to what NER pulled out of them
combined_comments.loc[:, ['id', 'comment', 'extracted_companies']].head(20)

Unnamed: 0,id,comment,extracted_companies
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...,hotosm-project-2403
9,45332136,Adding junction nodes or bridges to overlappin...,
11,45342825,#Map4Bj Buildings added,Map4Bj Buildings
21,46196259,#maproulette Crossing_Ways:_Highway-Railway,
25,46522668,#maproulette Crossing_Ways:_Highway-Railway,
26,46607617,Adding junction nodes or bridges to overlappin...,
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...,"hotosm-project-2791, EliminateMalaria #Zimbabwe #"
43,47835464,#maproulette Crossing_Type:_Highway-Railway,
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...,hotosm-project-2668
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...,"hotosm-project-2945, EliminateMalaria #Zimbabwe #"


In [None]:
# Function to extract the first hashtag as the company name using regex
def extract_first_company_name_regex(comment):
    """Return the word that follows the first '#' in a comment.

    Only the leading run of word characters (letters, digits, underscore) is
    captured, so "#hotosm-project-2403" yields "hotosm" and "#to-fix" yields
    "to".

    Args:
        comment: Comment text. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV cell) yields ``None``
            instead of raising ``TypeError`` inside the regex call.

    Returns:
        str | None: The captured word, or ``None`` when the comment has no
        hashtag followed by a word character.
    """
    if not isinstance(comment, str):
        return None

    # re.search stops at the first match, which is all that is needed here.
    first_hashtag = re.search(r'#(\w+)', comment)
    return first_hashtag.group(1) if first_hashtag else None


# Add the first-hashtag word as a second "company name" column for comparison
combined_comments['extracted_companies_regex'] = combined_comments['comment'].map(extract_first_company_name_regex)

# Preview the comments next to the hashtag word extracted from each
combined_comments.loc[:, ['id', 'comment', 'extracted_companies_regex']].head(20)

Unnamed: 0,id,comment,extracted_companies_regex
5,45226986,#hotosm-project-2403 #MissingMaps #SouthAfrica...,hotosm
9,45332136,Adding junction nodes or bridges to overlappin...,to
11,45342825,#Map4Bj Buildings added,Map4Bj
21,46196259,#maproulette Crossing_Ways:_Highway-Railway,maproulette
25,46522668,#maproulette Crossing_Ways:_Highway-Railway,maproulette
26,46607617,Adding junction nodes or bridges to overlappin...,to
41,47648029,#hotosm-project-2791 #MissingMaps #EliminateMa...,hotosm
43,47835464,#maproulette Crossing_Type:_Highway-Railway,maproulette
47,48167333,#hotosm-project-2668 #missingmaps #peru #redcr...,hotosm
48,48415001,#hotosm-project-2945 #MissingMaps #EliminateMa...,hotosm


In [None]:
# Save the labeled data to a new CSV file (row index omitted).
# This writes `df`, i.e. the original columns plus the label columns added to
# it. The 'extracted_companies' and 'extracted_companies_regex' columns were
# assigned to `combined_comments`, not `df`, so they are not in this file.
output_file = "labeled_comments.csv"
df.to_csv(output_file, index=False)

print(f"Labeled comments saved to {output_file}")

Labeled comments saved to labeled_comments.csv
