## Manuscript Keyword Search Program

In [None]:
# Note: may be some way to use this to get the token offset to get an appximate page number: https://www.nltk.org/api/nltk.text.html#module-nltk.text

In [1]:
%cd /Users/lisamcharron/Desktop/SMR_PythonProgram

/Users/lisamcharron/Desktop/SMR_PythonProgram


In [2]:
#import necessary packages
from pathlib import Path
from tkinter.filedialog import askopenfilenames
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.text import Text, TokenSearcher
import re
from pdfminer.high_level import extract_text, extract_pages
from IPython.core.display import display, HTML

  from IPython.core.display import display, HTML


### Define functions

In [3]:
def all_pdfs_in_folder(folder_path):
    return Path(folder_path).glob("*pdf")

def context_search_string(keyword):
    return f"<.*>{{0,50}}<{keyword}.*><.*>{{0,50}}" 

def find_contexts(text, keyword):
    search_formatted = context_search_string(keyword)
    tokens = wordpunct_tokenize(text)
    textList = Text(tokens)
    searcher = TokenSearcher(textList)
    contexts = searcher.findall(search_formatted)
    contexts = [" ".join(c) for c in contexts]
    return contexts

def count_pages(pdf_path):
    page_count = len(list(extract_pages(pdf_path)))
    return page_count

def estimate_page_number(text, context, page_count):
    # location = text.find(context) / len(text)
    # approx_page = page_count*location
    # return approx_page
    return text.find(context)

def html_highlight_by_keyword(keyword, context):
    span_color = "<span style='color:red'>{colored_text}</span>"
    pattern = f"({keyword}\w*)"
    replace = span_color.format(colored_text=r"\1")
    return re.sub(pattern, replace, context, flags=re.IGNORECASE)

def html_matches_contexts_table(keywords_and_contexts, pages, title):
    table_header = f"""
    <link rel="stylesheet" href="https://cdn.simplecss.org/simple.min.css">
    <h1>{title}</h1>
    <table>
        <tr>
            <th>Keyword</th>
            <th>Approx. Page</th>
            <th>Context</th>
        </tr>
    """
    table_row = """
        <tr>
            <td>{keyword}</td>
            <td>{page}</td>
            <td>{context}</td>
        </tr>
    """
    table_rows = [table_row.format(keyword=k, page=p, context=c) for (k, c), p in zip(keywords_and_contexts, pages)]
    return table_header + "".join(table_rows) + "</table>"

def to_html_file(file_path, html_string):
    with open(file_path, "w") as f:
        f.write(html_string)

### Get file paths to iterate through

In [4]:
pdf_folder = Path("./manuscripts")
pdf_paths = [pdf_path for pdf_path in all_pdfs_in_folder(pdf_folder)]
pdf_paths

[PosixPath('manuscripts/SMR-23-0074_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0073_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0072_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0075_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0078_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0079_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0077_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0070_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0071_Proof_hi.pdf'),
 PosixPath('manuscripts/SMR-23-0076_Proof_hi.pdf')]

### Input keywords

In [5]:
keywords = ["author", "blind", "mask", "redact", "remov", "omit", "xxx", "anon", "replicat", "repro", "repos", "code", "syntax", " script", "package", "program", "osf", "figshare", "github", "cran", "stata", "python", "availab"]

### Run program

In [6]:
tables = []
for pdf_path in pdf_paths:
    page_count = count_pages(pdf_path)
    text = extract_text(pdf_path)
    text = text.lower()
    keywords_and_contexts = []
    for keyword in keywords:
        contexts = find_contexts(text, keyword)
        for context in contexts:
            context_highlighted = html_highlight_by_keyword(keyword, context)
            keywords_and_contexts.append([keyword, context_highlighted])
    title = pdf_path.stem
    pages = [estimate_page_number(text, c, page_count) for k, c in keywords_and_contexts]
    table = html_matches_contexts_table(keywords_and_contexts, pages, title)
    tables.append(table)
html = "".join(tables)
to_html_file("./index.html", html)
display(HTML(html))          

Keyword,Approx. Page,Context
remov,-1,"53 54 55 56 57 58 59 60 o r p accordingly . for sampling purposes , however , it is not necessary for researchers to search the sampling frame for inactive or miscategorized cases . instead , if researchers discover an ineligible case among their sample , they can remove it from the sample and replace it with a new case drawn at random from the sampling frame ( see harter et al ., 2010 : 173 - 4 ). alternatively , researchers could address overcoverage by oversampling ( kalton , 2021 ). the second limitation concerns duplication error :"
remov,-1,"the past year , and if so , request the address of its previous location . then , researchers can search the sampling frame for duplicate cases of those organizations . if they discover a duplicate in the sampling frame , they can make the population count more accurate by removing the duplicate ; specifically , the case https :// mc . manuscriptcentral . com / smr sociological methods and research page 16 of 45 with the less - current location information . they can also correct the duplicate case ’ s doubled likelihood of being selected by applying a weight"
remov,-1,"from the sampling frames it produces , researchers can draw multiple samples of varying sizes without f o r p increasing cost . in addition , because the method allows researchers to specify the geographic area and search criteria , researchers can save time by not having to identify and remove organizations outside their scope of study . e e furthermore , the information that traditional sampling frame sources contain is often outdated because they are typically updated and published no more than once per year . in contrast , because r the information on google maps is continuously updated ,"
omit,-1,"concerns undercoverage error : not including organizations that are in the target population ( eligible cases not included in the sampling frame ). the types of eligible but f o r p erroneously excluded cases include 1 ) organizations that are not on the text layer of google maps ( omitted cases ) and 2 ) organizations that google maps miscategorized as not being the target population type . for example , the congregation sampling frame we generated does not include a e e particular congregation because google maps miscategorized it as a food bank . our analysis indicates that google"
omit,-1,"kobayashi et al ., 2018 ), and because it allows users to correct organizations ’ information on google maps . undercoverage will create an undercount of the target population . however , researchers can conduct analyses similar to those discussed above to estimate the percentage of eligible organizations that were omitted or miscategorized , and then revise the estimated target population size accordingly . in addition , google maps ’ very low undercoverage error and our analysis of its representativeness https :// mc . manuscriptcentral . com / smr 1 2 3 4 5 6 7 8 9 10 11 12"
repos,-1,"and sample sources possess significant limitations that undermine their coverage , representativeness , and utility . to address these challenges , we developed a new method for generating organizational sampling frames that compiles publicly w available data contained on the google maps platform . google maps functions as a data repository that gathers information from multiple sources and continually organizes and updates its data . more specifically , google maps ’ “ places ” feature obtains and curates information about every organization on its platform . 2 the program we developed extracts this information to generate 1 the probability that an"
repos,-1,"the limitations associated with using google maps to generate sampling frames are minimal , especially compared to the limitations of alternative methods , and can be mitigated through supplemental analyses . in addition , sampling frames generated by this approach can be enhanced by applying it to other online data repositories that obtain their data from a variety of different sources ( bhutta , 2012 ). for example , yelp is an online directory of businesses that crowdsources its data from user - supplied information and reviews . alternatively , yellowpages . com f o r p gathers its information on"
repos,-1,"organizations primarily from phone companies and business owners . furthermore , an increasing number of organizations , such as safegraph and esri , are building open data platforms and compiling places datasets . although the data on the google maps platform e e are the most comprehensive , using multiple repositories that obtain their data from complementary sources allows for cross - checking and could further increase the coverage and representativeness r of the sampling frames that are generated . r e v i e moving forward , by making our program available to researchers will help to improve the quality"
code,-1,"frames from google maps o r p in this section , we describe the python - based program we developed to generate sampling frames by extracting data from the google maps platform on particular organizations of interest e e within specified geographic areas ( see appendix c for the python code used for this study ). the program uses the google maps places api , a service researchers can access using a software r program that returns requested data from the platform in a structured uniform format ready to analyze ( see appendix a for links to the google maps place"
code,-1,"research opportunities . american behavioral scientist , 45 ( 10 ), 1523 - 1549 . chaves , m ., anderson , s ., eagle , a ., hawkins , m ., holleman , a ., & roso , j . ( 2020 ). national congregations study : cumulative datafile and codebook . duke university . dennis , j . m . ( 2019 ). technical overview of the amerispeak panel norc ’ s probability - based household panel . norc at the university of chicago . firth , d . ( 1993 ). bias reduction of maximum likelihood estimates . biometrika"

Keyword,Approx. Page,Context
author,-1,"16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 declaration of conflicting interests the author ( s ) declared no potential conflicts of interest with respect to the research , authorship , and / or publication of this article . funding the author ( s ) received no financial support for the research , authorship , and / or publication of this article . f"
anon,-1,", where similarity is the ratio of within - cluster distances to between - cluster distances ( arturo et al . 2018 ; petrovic 2006 ). methods f o r data characteristic p the cavi questionnaire was the primary research tool used to collect data . the e survey was anonymous , with a sample size of 600 people ( 377 women and 223 men ). r e the survey was designed to investigate the value systems of women and men r e representing the "" silver "" generation of employees and it included respondents aged 50 and up who are"
availab,-1,"use of this technique increases the interpretability of data and minimize the loss of information . pca creates new variables that are uncorrelated and maximizes variance . this method is an adaptive data analysis technique , due to the fact that the search for principal components is based on the available dataset , and the solution is the eigenvalue problem ( jollife & cadima , 2016 ). agglomerative hierarchical clustering agglomerative hierarchical clustering ( ahc ) is a family of different methods f that are related to each other at the computational level . all of the methods in this o"

Keyword,Approx. Page,Context
author,-1,"voice is heard , which otherwise was limited to elite mass media . mass o media outlets are regarded as inclined towards their own point of view [ 1 , 2 ]. the alternative opinion expressed by social media provides scope for balancing r p eﬀect , neutralising slanted , authoritative media voices . although , news media e eﬀectively creates a echo chamber eﬀect within the social media reactions where e people selectively amplify or supplement their own belief inside a closed system r that they choose to create , insulating them from rebuttal . digital platforms r can be"
author,-1,"list of newspapers . the newspa - per data collection process is programmed once in 24 - hour intervals . each news article contains the following ﬁelds : article title , date of publication , publisher /- name of the newspaper ( the guardian , the independent , etc .), author , image urls , article url , keywords , video urls , article summary and the full text . f similar to twitter , newspaper scrapping uses python beautifulsoup apis to o identify html elements ( like title , author , etc .) from each of the article urls ."
mask,-1,"19 as pan - demic , during the period of 13 - 19 march 2020 ( week - 2 ), covid - 19 rapidly spread across the world . still , social media opinion seems to have been divided between topics like the restriction of movement and the eﬀectiveness of masks . as expected , panic , anxiety , and loneliness soon settled in the social media sphere , however . the few most popular tweets were : covid19 is real .. i ’ m on days home isolation , this whole working from home is going to change things and"
remov,-1,"the resultant data can be jointly analysed through nlp techniques like n - grams , parts of speech , lemmatisation , sentiment classiﬁcation , similarity , etc ., and computer vision tools . the core functionalities of map include the aggregation of multiple twitter searches or news articles and the removal of unwanted noisy data . existing social media analysis tools provide insights that simply fail to aggregate temporal modalities 10 https :// mc . manuscriptcentral . com / smr page 11 of 29 sociological methods and research 1 2 3 4 5 6 7 8 9 10 11 12 13"
repro,-1,"from here , it is pos - sible to undertake case studies to identify the dynamics of growth , decay , and re - emergence of trending topics during the covid - 19 pandemic situations . 8 . growth dynamics the growth dynamics of information diﬀusion in media through the repro - duction , ampliﬁcation , and recontextualisation of the original data follow the 11 https :// mc . manuscriptcentral . com / smr 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27"
repro,-1,44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 sociological methods and research page 12 of 29 same principle as the spread of infectious diseases like covid - 19 . the info - demic spreads intrinsically through the same model as the reproduction num - ber ( r0 ) in pandemics . with the r0
code,-1,"related to donald trump but not related to coronavirus and contain both images p and videos posted from washington . commercially available tools oﬀer easy - to - e use web - based interfaces that allow users to graphically interact with the system e without the need to write software code [ 7 , 8 ]. applications , therefore , can easily r be used by scientists without any knowledge of data science or programming r languages . nevertheless , in - depth analysis often requires custom functionality e that extends beyond the out - of - the - box features"
program,-1,"easy - to - e use web - based interfaces that allow users to graphically interact with the system e without the need to write software code [ 7 , 8 ]. applications , therefore , can easily r be used by scientists without any knowledge of data science or programming r languages . nevertheless , in - depth analysis often requires custom functionality e that extends beyond the out - of - the - box features that exist within the tools . for this reason , advanced analytics tools should be extendable to incorporate the custom data science logic through"
program,-1,"structured query language ( sql ) or programming languages like python and java . w many existing platforms [ 9 , 10 , 11 ] oﬀer sophisticated image analysis like re - verse image lookup . visual evidence of the number of times an image is used across social media platforms builds a much more powerful metric ."
program,-1,"with twitter data generation , newspaper data generation is an automated backend process . a python process scheduled through the linux cron job incrementally updates the database with all the news articles published on that day from a list of newspapers . the newspa - per data collection process is programmed once in 24 - hour intervals . each news article contains the following ﬁelds : article title , date of publication , publisher /- name of the newspaper ( the guardian , the independent , etc .), author , image urls , article url , keywords , video urls ,"

Keyword,Approx. Page,Context
author,-1,"1 ): 120 – 145 , 2013 . r r e v i e m . höhle and l . held . bayesian estimation of the size of a population . technical report , discussion paper , 2006 . y . hou and r . truex . ethnic discrimination and authoritarian rule : an analysis of criminal sentencing in china . available at ssrn 3481448 , 2020 . w q .- y . jiang . court delay and law enforcement in china : civil process and economic perspec - tive . springer science & business media , 2008 . j ."
author,-1,"#˚ l ¯, pl ¯, ⇣ à , ’ ö „ company , limited liability company , lim - h , ïd , %⇢ gg , fb , àc , — ä ited company , establishment , legal rep - support resentative , investment , business license , written , authorization , technology yó , ˙ æ , ( 0 , y �, ˙ æyó , ƒ ✓, ° project , construction , land use , approval , y , øéqõ , ✏¡, ïd construction project , planning , approval , environmental impact , opinion , invest - ment ��, µº"
remov,-1,"political sensitivity . for example , the british and irish legal information w institute , which makes court data from the united kingdom and ireland available on - line , does not allow its cases to be indexed by google , in the event that one of the cases is removed for privacy reasons . 4 in part for this reason , the supreme people ’ s court of china has extensive regulations on what cases are allowed to be uploaded to the database and which should not be uploaded at all , and recently journalists have 1the people ’ s"
remov,-1,"21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 noticed that cases that were once uploaded have been quietly removed ( xie , 2021 ). 5 further , data about individual cases can be incomplete , missing important pieces of metadata such as dates , legal process , or information about the facts of the case and parties that can be essential for understanding context . for scholars , practitioners"
remov,-1,"for cases where this information is missing . we extracted information such as case id , court , and court subdivision from 42 . 4 million judicial decision documents in our database , using a parsing script . 14 we then subset the data by court subdivision . 15 we removed duplicate cases from the data by removing documents that contain the same year , court , court subdivision , and case serial number . 16 this left us with one document representing each case within our data . for the period from 2013 - 2017 , our dataset contains a"
remov,-1,"- subdivision - year ( c , d , y ) from the unstructured text , as described in the appendix . 20 to estimate the total number of cases , we as - sume a constant rate of case intake for each batch across all working days . we remove all weekends and public holidays and take into account all other ofﬁcial holiday ad - justments , which leaves us with 250 working days total for all years . 21 we then use ordinary least squares to ﬁt a line between the working day wc , d , y ,"
remov,-1,"since chinese litigation laws also require courts to decide whether or not to accept administrative and civil cases within 7 days after they are ﬁled ( 15 days for criminal cases ). we use the ﬁling date as the registration date when the acceptance date is not available . 20we removed registration dates and case numbers that were clearly errors , such as registration dates in the wrong year . before estimation , we also removed extreme outliers in the relationship between the start date and serial number to reduce the impact of data errors on our estimation . 21leap years"
remov,-1,"dates are missing at random after condition - ing on serial number . since we do not see a systematic pattern of which registration dates are missing in terms of the type of case within a court subdivision , we think this is a reasonable assumption . 36importantly , we remove all holidays and weekends from the date variable in order to avoid predict - ing registration dates on days when the courts are not open . 37we use leave - one - out cross - validation for all administrative batches and batches that have fewer than 50 cases . for"
remov,-1,"19 ( 2 ): 15 – 40 , 2019 . l . vereeck and m . mühl . an economic theory of court delay . european journal of law and economics , 10 ( 3 ): 243 – 268 , 2000 . e . xie . millions of court rulings removed from ofﬁcial chi - nese database . south china morning post , jun 26 : https :// www . scmp . com / news / china / politics / article / 3138830 / millions -- court -- rulings -- removed -- official -- chinese -- database , f 2021 ."
remov,-1,". compared to the current cjo web - e site , we are missing a signiﬁcant number of civil ﬁrst instance ﬁles and civil rehearing ﬁles . however , the current cjo website has fewer criminal and administrative cases than we have in our database , likely due to the removal of these cases as noted in xie w ( 2021 ). b . 1 first instance cases table 6 shows that cjo currently has 9 - 12 % more civil ﬁrst instance ﬁles than we do in the period 2013 to 2015 , 20 % more for 2016 , and"

Keyword,Approx. Page,Context
author,-1,"inconsistent ( e . g ., peycheva et al . 2021 ; sala et al . 2012 ). individual survey w respondents appear to have a latent “ willingness to consent ” in multiple consent requests asked within one interview ( jenkins et al . 2006 ; mostafa 2016 ; author reference ) but evidence of a latent willingness to consent over time is weaker ( mostafa and wiggins 2018 ) and many respondents who decline to consent give the opposite answer if asked again at a later date ( weir et al . 2014 ; jӓckle et al . 2021a"
author,-1,"conceptual thinking and the research design for this paper were informed by qualitative in - depth interviews , in which respondents from understanding society : the uk household longitudinal study were asked how they came up with their decision to consent or not to consent to administrative record linkage ( author 4 https :// mc . manuscriptcentral . com / smr page 5 of 55 sociological methods and research 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32"
author,-1,"methods and research page 8 of 55 ( how the information is presented ) of the request are important here . the survey mode in which the request is delivered and the role of the agent ( interviewer ) in delivering the request also play a role ( see ( author reference ) for a detailed examination of differences in the decision process between face - to - face and web respondents ). much of the experimental research on consent has focussed on manipulating the content and format of the request ( e . g ., gain versus loss framing ,"
author,-1,"longer versus shorter descriptions of the linkage process , etc .). this literature , which is reviewed in ( author reference ), has found small and often contradictory effects on consent . [ figure 1 about here ] f o in the conceptual framework described in figure 1 , the decision process r mediates the effect of these background variables on outcomes , including the decision p itself and comprehension"
author,-1,"for face - to - face and web r respondents , we refer to ip11a and ip11b , respectively , but our focus is on the r replication of findings across samples with different characteristics . for further details e v i e of the mode experiment , see ( author reference ). in the current paper , we define samples by mode of response , rather than assigned mode . w since the sample size constrained the number of experimental treatment groups we could implement , additional surveys were fielded using an access panel ( author reference ). the populuslive"
author,-1,"sample was selected independently in december 2019 and surveyed only once . we refer to this survey as ap2 . the implementation of these surveys was led by natcen social research , in collaboration with the populuslive panel . the ap samples included other experiments not reported on here ( author references ). table 1 presents a summary of the key features of the five samples used in our analyses . additional information on response and participation rates is presented in s1 appendix . f [ table 1 about here ] r o ethical approval for wave 11 of the innovation"
author,-1,"17 / 04 / 2018 . 2 . 2 . measures e v i e 2 . 2 . 1 . outcomes : consent , understanding , confidence w the key outcome we examine is consent to link to income , employment and tax records held by the uk tax authority , hm revenue and customs ( hmrc ). all respondents were asked this question and shown an information leaflet and a flowchart visualising the data linkage process . they were not told about the data linkage request in the invitation to the survey . the full wording of the question"
author,-1,26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 we report on the decision process for multiple consent questions in a separate paper ( author reference ). at the end of the ap surveys respondents were told that their data would not actually be linked as the purpose of the survey was to collect information about the attitudes and concerns of the general public about data sharing . the ip data will however be linked
omit,-1,"to ap2 : consequential ( alone ), usual ( alone ) and gut ( alone ). f - tests stem from regressions of variables in first column ( one per row ) on the three decision processes . respondents who reported “ something else ” or less frequent patterns are omitted from the analysis . 2 . in counting factors considered , we treat a factor as ‘ considered ’ if the respondent rated it as 4 or 5 on the five - point response scale . f o r p e e r r e v i e w 41"
anon,-1,". the hmrc records contain information about your current and previous employment , your income , national insurance contributions and tax credits . • we will not send hmrc the answers you have given in this study . • hmrc will send us your records . these will contain an anonymous identification number but not your name , address , sex or date of birth . • we will add the hmrc records to the answers you have given in this study . f • we will make the combined anonymous data available for academic and o policy research purposes only"

Keyword,Approx. Page,Context
omit,-1,". thus , 𝑌𝑖𝑡 ( 𝑋𝑖𝑡 = 1 ), represents individuals ’ internalizing behaviors had individual been 𝑖 victimized at time , and 𝑡 𝑌𝑖𝑡 ( 𝑋𝑖𝑡 = 0 ) represents individuals ’ internalizing behaviors had the same individual not been victimized at time t3 . subscripts and might be omitted for w 𝑡 𝑖 𝑖 simplicity . longitudinal data allows researchers to estimate different causal quantities because they can make different counterfactual comparisons . one important quantity of interest is what blackwell and glynn ( 2018 ) refer to as the contemporaneous effect of treatment ( cet ) of on"
replicat,-1,"doi : 10 . 1097 / ede . 0000000000000078 . f o r petersen , maya l ., yue wang , mark j . van der laan , and david r . bangsberg . 2006 . “ assessing the effectiveness of antiretroviral adherence interventions : using marginal structural models to replicate the findings of randomized controlled trials .” jaids journal of acquired immune deficiency syndromes 43 ( supplement 1 ): s96 – 103 . doi : 10 . 1097 / 01 . qai . 0000248344 . 95135 . 8d . e p e pratt , travis c ., jillian j ."
code,-1,"), all manipulations , and all measures r e v i e in the study , are properly reported . the dataset used in the empirical analysis is publicly available and can be downloaded here https :// nces . ed . gov / ecls / dataproducts . asp . analysis code is included in the supplemental materials . data were analyzed using stata , version 17 . this study ’ s w design and its analysis were not pre - registered . results the contemporaneous effect of peer victimization the contemporaneous treatment effect of peer victimization on internalizing behaviors was estimated"
code,-1,", mary c . hagedorn , peggy daly , and michelle najarian . 2015 . “ early childhood longitudinal study , kindergarten class of 2010 - 11 ( ecls - k : 2011 ). user ’ s manual for the ecls - k : 2011 kindergarten data file and electronic codebook , public version . nces 2015 - 074 .” national center for education statistics . r p e ttofi , maria m ., david p . farrington , friedrich lösel , and rolf loeber . 2011 . “ do the victims of school bullies tend to become depressed later in"
stata,-1,"in the study , are properly reported . the dataset used in the empirical analysis is publicly available and can be downloaded here https :// nces . ed . gov / ecls / dataproducts . asp . analysis code is included in the supplemental materials . data were analyzed using stata , version 17 . this study ’ s w design and its analysis were not pre - registered . results the contemporaneous effect of peer victimization the contemporaneous treatment effect of peer victimization on internalizing behaviors was estimated using a traditional fixed effects model ( equation 2 ), which controls"
availab,-1,"data journal : sociological methods and research manuscript id smr - 23 - 0079 manuscript type : original article f keywords : causal inference , longitudinal analysis , estimands , contemporaneous effect , cumulative effect , long - term effect o abstract : r p e e despite a growing availability of longitudinal datasets , it can be difficult to select the most appropriate modelling strategy . in particular , there is little guidance regarding which causal questions we can ask using longitudinal data , and what is the best way to answer these questions . this paper distinguishes between three"
availab,-1,"25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 asking – and answering – causal questions using longitudinal data abstract despite a growing availability of longitudinal datasets , it can be difficult to select the most appropriate modelling strategy . in particular , there is little guidance regarding which causal f o r p questions we can ask using longitudinal data , and what is the best way to answer these questions . this"
availab,-1,"w internalizing behaviors . https :// mc . manuscriptcentral . com / smr sociological methods and research page 2 of 45 asking causal questions introduction researchers in social and behavioral sciences often use longitudinal data to estimate causal effects using observational data . yet the increasing number of modelling strategies available make it hard to understand what specific questions we can ask – and answer – when we have repeated measures of the same units over time . the goal of this paper is to provide an accessible overview of different quantities we can estimate using longitudinal data , and describe"
availab,-1,"33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 asking causal questions learn ) should come first . then , one can assess whether the causal quantity can be identified from available data , and – if that is the case – choose an appropriate model to estimate that quantity ( lundberg et al . 2021 ; petersen and van der laan 2014 ). distinguishing between the causal quantity of interest ( estimand ) and the statistical model used to estimate that"
availab,-1,". transparency and openness o r p e e r the analytic sample size , all data exclusions ( if any ), all manipulations , and all measures r e v i e in the study , are properly reported . the dataset used in the empirical analysis is publicly available and can be downloaded here https :// nces . ed . gov / ecls / dataproducts . asp . analysis code is included in the supplemental materials . data were analyzed using stata , version 17 . this study ’ s w design and its analysis were not pre -"

Keyword,Approx. Page,Context
author,-1,"has a rather homogeneous cultural base . individual freedom of expression is frequently viewed as socially unacceptable in the hierarchical , holistic cultural framework of f confucianism , and the taoist middle ground it contains , which emphasizes harmony , unity , and o solidarity as well as respect for authority , is significantly different from that of western societies r p ( yang , 2004 ). in this context , it is clear that traditional chinese cultural ideas and customs are e irreconcilable with the requirements of a market - oriented society that places a strong emphasis on r e"
script,-1,"40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 sociological methods and research page 30 of 40 zuo , y . h ., ( 2011 ). the success and failure of the revolution of 1911 . red flag script , ( 06 ), 25 - 27 + 1 30 f o r p e e r r e v i e w https :// mc . manuscriptcentral . com / smr page 31 of 40 sociological methods and research 1 2 3 4 5 6 7 8 9 10"

Keyword,Approx. Page,Context
author,-1,"kete , lele , chokwe , and kongo . in her paper on lineage ties and domestic violence in ghana , asiedu ( 2016 ) used ethnicity as proxy for lineage by creating a dummy variable that categorized all akans as matrilineal and non - akans as patrilineal . the author acknowledged the limitations of using ethnicity as proxy for the lineage but added that there was no question in the data to measure lineage . blinded for review ( 2016 ) used a similar strategy to model the effects of kin group affiliation on marital f violence against women in"
author,-1,". y ., & djamba , y . ( 2018 ). ethnicity and gender - based violence in kenya . journal of family issues , 39 ( 11 ), 2958 - 2981 . stoeltje , b . j . ( 2006 ). asante queen mothers : a study in female authority . annals of the new york academy of sciences , 810 , 41 - 71 . takyi , b . k ., & mann , j . ( 2006 ). intimate partner violence in ghana , africa : the perspectives of men regarding wife beating . international journal of sociology"
blind,-1,"saharan africa has expanded significantly . this is certainly the case in ghana where the majority of studies have focused on domestic and intimate partner abuse against women ( cantalupo et al . 2006 ; takyi & mann 2006 ; adjei , 2016 ; dery et al . 2022 ; blinded for review ). as women continue to be marginalized in patriarchal settings such as ghana , studies have attempted to unravel the gendered inequities and power asymmetries that increase their vulnerabilities to domestic and intimate partner violence ( ipv ). for instance , some studies have focused on women ’"
blind,-1,"and discriminate against women , entrenched gender norms that perpetuate gender inequality , and women ’ s lack of e e autonomy and self - efficacy in the domestic space ( fulu & miedema 2015 ; united nations o r p women 2012 ; cardoso et al . 2016 ; blinded for review ). r in recent years , debates on ipv in sub - saharan africa and ghana have evolved to include r e v i e the contributions of key indigenous structures to women ’ s victimization . some researchers associate polygynous family arrangements — a key feature of"
blind,-1,"traditional african societies — to w women ’ s risk of ipv ( amo - adjei & tuoyire 2016 ; blinded for review ; jansen et al . 2019 ; behrman 2019 ). others suggest ethnic and lineage ties increase women ’ s risks of experiencing ipv ( sitawa et al . 2018 ; asiedu 2016 ; lowes 2018 ; blinded for review ; campbell & mace , 2022 ). for"
blind,-1,"2008 ghana demographic https :// mc . manuscriptcentral . com / smr sociological methods and research page 4 of 29 and health survey ( dhs ), showed women from patrilineal societies were significantly more likely to experience physical violence than women from matrilineal societies . with the same data , blinded for review found married women from patrilineal societies reported higher levels of physical and sexual violence than those from matrilineal societies . however , matrilineal women reported higher levels of emotional violence . blinded for review ( 2016 ) used qualitative data to explore comparative differences in the experiences of"
blind,-1,"ethnicity as proxy for lineage by creating a dummy variable that categorized all akans as matrilineal and non - akans as patrilineal . the author acknowledged the limitations of using ethnicity as proxy for the lineage but added that there was no question in the data to measure lineage . blinded for review ( 2016 ) used a similar strategy to model the effects of kin group affiliation on marital f violence against women in ghana . with no data on lineage , they categorized the akan ethnic group as matrilineal and all other ethnic groups as patrilineal . o r"
blind,-1,". the analytic sample was limited to 1 , 624 participants who provided answers to the questions on lineage and experiences with ipv . data collection and protocol data for this study were collected by eight team members including six research assistants ( ras ) recruited and trained at the blinded for review . two ras were assigned to each ecological zone . all ras could speak english , but to facilitate data collection , they also f o r p spoke the local languages specific to their communities . although all ras had previous data collection experience , several training"
blind,-1,"sessions were held at blinded for review to hone their skills . language competence , combined with in - depth knowledge of data collection , e e expedited the process , as ras easily established rapport with the study participants . questionnaires for data collection were pre - tested with about 3 % of the"
blind,-1,"and modified r accordingly . participants used in the pre - testing phase of data collection were not included in the final sample . face - to - face interviews were used to solicit information from research participants . before data collection , ethical clearance was received from the [ blinded for w review ]. ras were trained to be sensitive to the emotional needs of study participants and adhere strictly to confidentiality and privacy requirements , given the sensitive nature of the topic . ras were also trained to adhere to the who ’ s recommendations on conducting research on"

Keyword,Approx. Page,Context
author,-1,". in these social contexts , the agents r can perform either actions that are tied to these specific contexts or actions that are common to them e all . these contexts constitute a highly dynamic social environment with many competing ways of r e behaving . according to the authors , their simulations show that the “ conformist agents ” whose actions v i e are based on a simple rule of local imitation are not able to converge in a stable way on one behavior and thus are not able to produce emergent population - level behavioral regularities that"
author,-1,"in a specific environment . andrighetto et al . ( 2010 , 14 ) explains that “ they converge en masse on single action [...] rapidly but unstably : conformity varies over time , depending on the actions of the other agents within each scenario and over time ”. the authors take this to indicate that mere statistical frequency of actions is not enough to achieve behavioral convergence over time that is a characteristic feature of social norms . https :// mc . manuscriptcentral . com / smr 1 2 3 4 5 6 7 8 9 10 11 12 13"
author,-1,"20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 sociological methods and research page 12 of 47 the authors argue that for shared population - level behavioral regularities to emerge in this emulated “ liquid society ” and “ multi - setting world ” ( campennì et al 2014 , 94 ; 101 ), some further effect in the internal cognitive processing of agents is needed . they test"
author,-1,"p in a similar vein to edmonds ( 2020 ), goldberg and stein ( 2018 ) build on the research on e connectionist constraint satisfaction networks . together with the theory of semantic cognition , where r semantic context is assumed to affect how new information is interpreted , the authors were able to e r formulate a two - stage transmission model of associative diffusion in order to explain the emergence e of patterned cultural variation . their simulation aims to describe how endogenous cultural v differentiation can be entirely attributed to the transmission dynamics driven by the agents ’"
replicat,-1,"“ a gradual process in which the emergent effects determine new mental mechanisms in r the agents involved , who are not necessarily aware of the effects produced ”. in addition , they contend e that : v i e while in second - order emergence the global effect is replicated because agents w perceive it , immergent effects consist of a new set of mental and behavioral properties that ensure the reproduction of the global effect independently of the agents ’ awareness . in other words , some effects of social systems are reproduced even though , or just because"
replicat,-1,"is some sorting of agents , so agents who are connected are more likely to hold similar beliefs , but the fixed [ social ] network structure limits the extent to which this can occur ” ( edmonds 2020 , 502 ). with combined modelling , edmonds ( 2020 ) replicates empirical data concerning the noisy and non - smooth https :// mc . manuscriptcentral . com / smr 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32"
repro,-1,"means that some modification of a mental mechanism , representation or process in the agents ’ minds is required for the social phenomenon to emerge , since this modification leads the agents to transform their behavior in conformity with the emerging effect and , thereby , to contribute to its reproduction in future , essentially forming a “ complex recursive loop ” ( andrighetto , conte and turrini 2007 , 4 ). f o conte and her collaborators ( conte , andrighetto and campenni 2009 ; conte et al . 2014 ) further argue r that this complex recursive loop can"
repro,-1,"is a more complex and gradual form of v i incorporative downward causation process . it occurs when the agents incorporate the emergent effect e more deeply into their mental representations and mechanisms that control their actions , thereby w further increasing the probability for the emergent effect to be reproduced . immergence may occur without agents ’ awareness or understanding of these effects . perhaps the most elaborated example of the latter complex recursive loop is the immergence of social norms that conte and her collaborators have discussed in a series of papers ( conte et al 2007 ; conte"
repro,-1,"necessarily aware of the effects produced ”. in addition , they contend e that : v i e while in second - order emergence the global effect is replicated because agents w perceive it , immergent effects consist of a new set of mental and behavioral properties that ensure the reproduction of the global effect independently of the agents ’ awareness . in other words , some effects of social systems are reproduced even though , or just because , they are not transparent . ( andrighetto and conte 2014 , 24 ) only this kind of immergence of the emerging"
repro,-1,"46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 later . before moving on , we will make a few short comments on the general methodological advantages of the reviewed approach to agent - based modeling . simulations are usually made in order to reproduce some real - world data or target phenomena . this is useful as it not only allows us to verify and identify the relevant components and actors of the target system . it also provides a good way to investigate tipping points and transition periods in social processes as well"

Keyword,Approx. Page,Context
remov,-1,"17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 propensity to attend certain events constant removes within - individual variation , leaving only between - individual heterogeneity of preferences . for the yearly simulated network with no rewiring , we estimated the propensities for attending any type of event during the first year of a member ' s entry into the network and used that propensity"
omit,-1,"( 0 . 006 ) w 0 . 050 *** ( 0 . 008 ) - 0 . 600 *** ( 0 . 009 ) year event novelty num events attended ( log ) num network connections ( log ) observations 88 , 7241 25 , 0121 1some observations are omitted based on privacy settings of certain members that do not publicly list interest or group memberships , which are needed to calculate specialization scores * p < 0 . 05 ; ** p < 0 . 01 , *** p < 0 . 0 1 13 https :// mc ."
xxx,-1,"33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 in the second part of our explanation sketch where we focus on the social dynamics of emergence of the knowledge economy ( xxx , xxx and xxx 2023 ). the present article proceeds as follows : we first provide a sketch of meetup . com , an nonprofit platform from which online data for this paper was collected ; we next describe how we use these data to develop our method of measuring"
xxx,-1,"v i e clusters ( cometto and piol 2013 ; zukin 2020 . as a core economic institution of the 21st century knowledge economy , meetup . com has been instrumental in facilitating face - to - face knowledge w spillover events dedicated to showcasing and exchanging useful knowledge ( xxx and xxx 2020a ). individuals create their own profiles on the platform and can affiliate with numerous special interests by either listing their interests , joining specialized groups , or attending events . meetup ’ s online data offers a novel opportunity to measure various types of specialized knowledge and"
xxx,-1,was - darling - tech - industry - can - it - survive - wework - n1106676 3 https :// mc . manuscriptcentral . com / smr sociological methods and research page 4 of 27 during the formative period of emergence and rapid growth of two regional knowledge economies ( xxxx and xxxx 2020b ). figure 1 shows an example profile on meetup . com and potential fields that signal an individual ’ s knowledge specialization . we focus on the metropolitan areas of new york and los angeles to detail methods used in the measurement of social interactions of technologists
xxx,-1,"configurations . this facilitates evolutionary change because entire w modules can be rewired while maintaining their modular functions . analogously , within the knowledge economy , modularity enables innovative activity if actors in a knowledge cluster can bridge across multiple clusters of specialists who are progressively and dynamically differentiated . xxxx and xxxx ( 2020a ) longitudinal study of email threads of technologists in the emergence of new york city ’ s knowledge economy shows that social processes giving rise to 14 https :// mc . manuscriptcentral . com / smr 1 2 3 4 5 6 7 8 9 10"
xxx,-1,"the simulations change the way individuals attend events and thus the way individuals connect with one another through events . figures 2 and figure 3 together suggest that a major source of new knowledge within the tech communities over time is the entry of new members into the network ( xxx et al . 2023 ). if the influx of new members alone , rather than their preferences , is the main driver of structural change of the network of an emerging knowledge economy , we would expect to see a similar pattern of modularity if we simulated random event attendances"
xxx,-1,"36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 across meetup groups in our sample are characterized by increasing modularity over time , but introducing rewiring maintains higher between - group connectivity throughout ( xxx et al . 2023 ). f o r p e e figure 5 : observed network modularity and expected network modularity under entry with r r undifferentiated preferences over time for new york and los angeles tech communities . e community structure appears to increase over time despite the expected"
xxx,-1,"r p e e r r e v i e w as the knowledge economy matures , heterogeneous preferences among a growing pool of members facilitate the formation of social boundaries of modular communities , but rewiring tends to integrate the network by creating bridge ties across modular communities ( xxx and xxx 2020b ). at the micro - level , as individuals select which events to pursue and which to forgo , they are individually exposed to more diverse interests through other individuals they encounter at their events . at the macro - level , this type of selection leads"
xxx,-1,"fast unfolding of communitites in large networks .” journal of statistical mechanics : theory and experiment p10008 . cometto , maria teresa , and alessandro piol . 2013 . tech and the city : the making of new york ’ s startup community . new york : mirandola press . xxxxxx . 2020a . “ xxxxxx ” xxxx xxxx review ( anonymized reference ) f o mokyr , joel . 2002 . the gifts of athena : historical origins of the knowledge economy . r princeton : princeton university press . p xxx xxx . 2020b . “ xxxxxx ” xxxxxx"
